
<!DOCTYPE html>

<html lang="zh-Hans">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />

    <title>参考答案 &#8212; Joyful Pandas 1.0 documentation</title>
<script>
  // Copy the stored "mode" preference onto <html data-mode="...">; when nothing
  // (or an empty string) is stored, the attribute is set to "".
  document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
  // Copy the stored "theme" value onto <html data-theme="...">, defaulting to
  // "light" when no value has been saved.
  document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"
</script>

  <!-- Loaded before other Sphinx assets -->
  <link href="../_static/styles/theme.css?digest=92025949c220c2e29695" rel="stylesheet">
<link href="../_static/styles/pydata-sphinx-theme.css?digest=92025949c220c2e29695" rel="stylesheet">


  <link rel="stylesheet"
    href="../_static/vendor/fontawesome/5.13.0/css/all.min.css">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">

    <link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
    <link rel="stylesheet" type="text/css" href="../_static/plot_directive.css" />
    <link rel="stylesheet" type="text/css" href="../_static/css/s4defs-roles.css" />

  <!-- Pre-loaded scripts that we'll load fully later -->
  <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=92025949c220c2e29695">

    <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
    <script src="../_static/jquery.js"></script>
    <script src="../_static/underscore.js"></script>
    <script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
    <script src="../_static/doctools.js"></script>
    <script async="async" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" />
    <link rel="next" title="Author" href="../Author.html" />
    <link rel="prev" title="第十章 时序数据" href="ch10.html" />
<meta name="docsearch:language" content="en">
  </head>
  
  
  <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="180" data-default-mode="">
    <div class="bd-header-announcement container-fluid" id="banner">
      

    </div>

    
    <nav class="bd-header navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"><div class="bd-header__inner container-xl">

  <div id="navbar-start">
    
    
  


<a class="navbar-brand logo" href="../index.html">
  
  
  
  
    <img src="../_static/finallogo1.svg" class="logo__image only-light" alt="Logo image">
    <img src="../_static/finallogo1.svg" class="logo__image only-dark" alt="Logo image">
  
  
</a>
    
  </div>

  <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-collapsible" aria-controls="navbar-collapsible" aria-expanded="false" aria-label="Toggle navigation">
    <span class="fas fa-bars"></span>
  </button>

  
  <div id="navbar-collapsible" class="col-lg-9 collapse navbar-collapse">
    <div id="navbar-center" class="mr-auto">
      
      <div class="navbar-center-item">
        <ul id="navbar-main-elements" class="navbar-nav">
    <li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../Home.html">
  Home
 </a>
</li>

<li class="toctree-l1 current active nav-item">
 <a class="reference internal nav-link" href="index.html">
  Content
 </a>
</li>

<li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../Author.html">
  Author
 </a>
</li>

<li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../Datawhale.html">
  Datawhale
 </a>
</li>

<li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../pandas%E6%95%B0%E6%8D%AE%E5%A4%84%E7%90%86%E4%B8%8E%E5%88%86%E6%9E%90.html">
  pandas数据处理与分析
 </a>
</li>

<li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../%E8%A1%A5%E5%85%85%E4%B9%A0%E9%A2%98.html">
  补充习题
 </a>
</li>

    
    <li class="nav-item">
        <a class="nav-link nav-external" href="https://pandas.pydata.org/docs/index.html">Doc<i class="fas fa-external-link-alt"></i></a>
    </li>
    
</ul>
      </div>
      
    </div>

    <div id="navbar-end">
      
      <div class="navbar-end-item">
        <span id="theme-switch" class="btn btn-sm btn-outline-primary navbar-btn rounded-circle">
    <a class="theme-switch" data-mode="light"><i class="fas fa-sun"></i></a>
    <a class="theme-switch" data-mode="dark"><i class="far fa-moon"></i></a>
    <a class="theme-switch" data-mode="auto"><i class="fas fa-adjust"></i></a>
</span>
      </div>
      
      <div class="navbar-end-item">
        <ul id="navbar-icon-links" class="navbar-nav" aria-label="Icon Links">
        <li class="nav-item">
          <a class="nav-link" href="https://github.com/datawhalechina/joyful-pandas" rel="noopener" target="_blank" title="GitHub"><span><i class="fab fa-github-square"></i></span>
            <span class="sr-only">GitHub</span></a>
        </li>
      </ul>
      </div>
      
    </div>
  </div>
</div>
    </nav>
    

    <div class="bd-container container-xl">
      <div class="bd-container__inner row">
          

<!-- Only show if we have sidebars configured, else just a small margin  -->
<div class="bd-sidebar-primary col-12 col-md-3 bd-sidebar">
  <div class="sidebar-start-items"><form class="bd-search d-flex align-items-center" action="../search.html" method="get">
  <i class="icon fas fa-search"></i>
  <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
  <div class="bd-toc-item active">
    <ul class="current nav bd-sidenav">
 <li class="toctree-l1">
  <a class="reference internal" href="ch1.html">
   第一章 预备知识
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch2.html">
   第二章 pandas基础
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch3.html">
   第三章 索引
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch4.html">
   第四章 分组
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch5.html">
   第五章 变形
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch6.html">
   第六章 连接
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch7.html">
   第七章 缺失数据
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch8.html">
   第八章 文本数据
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch9.html">
   第九章 分类数据
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch10.html">
   第十章 时序数据
  </a>
 </li>
 <li class="toctree-l1 current active">
  <a class="current reference internal" href="#">
   参考答案
  </a>
 </li>
</ul>

  </div>
</nav>
  </div>
  <div class="sidebar-end-items">
  </div>
</div>


          


<div class="bd-sidebar-secondary d-none d-xl-block col-xl-2 bd-toc">
  
    
    <div class="toc-item">
      
<div class="tocsection onthispage mt-5 pt-1 pb-3">
    <i class="fas fa-list"></i> On this page
</div>

<nav id="bd-toc-nav">
    <ul class="visible nav section-nav flex-column">
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id2">
   第一章 预备知识
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex1">
     Ex1：利用列表推导式写矩阵乘法
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex2">
     Ex2：更新矩阵
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex3">
     Ex3：卡方统计量
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex4">
     Ex4：改进矩阵计算的性能
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex5">
     Ex5：连续整数的最大长度
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#pandas">
   第二章 pandas基础
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id3">
     Ex1：口袋妖怪数据集
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id4">
     Ex2：指数加权窗口
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id5">
   第三章 索引
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id6">
     Ex1：公司员工数据集
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id7">
     Ex2：巧克力数据集
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id8">
   第四章 分组
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id9">
     Ex1：汽车数据集
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex2-transform">
     Ex2：实现transform函数
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id10">
   第五章 变形
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id11">
     Ex1：美国非法药物数据集
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex2-wide-to-long">
     Ex2：特殊的wide_to_long方法
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id12">
   第六章 连接
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id13">
     Ex1：美国疫情数据集
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex2-join">
     Ex2：实现join函数
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id14">
   第七章 缺失数据
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id15">
     Ex1：缺失值与类别的相关性检验
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id16">
     Ex2：用回归模型解决分类问题
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id17">
   第八章 文本数据
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id18">
     Ex1：房屋信息数据集
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id19">
     Ex2：《权力的游戏》剧本数据集
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id20">
   第九章 分类数据
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id21">
     Ex1：统计未出现的类别
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id22">
     Ex2：钻石数据集
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id23">
   第十章 时序数据
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id24">
     Ex1：太阳辐射数据集
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id25">
     Ex2：水果销量数据集
    </a>
   </li>
  </ul>
 </li>
</ul>

</nav>
    </div>
    
    <div class="toc-item">
      
    </div>
    
  
</div>


          
          
          <div class="bd-content col-12 col-md-9 col-xl-7">
              
              <article class="bd-article" role="main">
                
  <section id="id1">
<h1>参考答案<a class="headerlink" href="#id1" title="Permalink to this heading">#</a></h1>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>

<span class="gp">In [2]: </span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>

<span class="gp">In [3]: </span><span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="k">as</span> <span class="nn">plt</span>
</pre></div>
</div>
<section id="id2">
<h2>第一章 预备知识<a class="headerlink" href="#id2" title="Permalink to this heading">#</a></h2>
<section id="ex1">
<h3>Ex1：利用列表推导式写矩阵乘法<a class="headerlink" href="#ex1" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [4]: </span><span class="n">M1</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span><span class="mi">3</span><span class="p">)</span>

<span class="gp">In [5]: </span><span class="n">M2</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span><span class="mi">4</span><span class="p">)</span>

<span class="gp">In [6]: </span><span class="n">res</span> <span class="o">=</span> <span class="p">[[</span><span class="nb">sum</span><span class="p">([</span><span class="n">M1</span><span class="p">[</span><span class="n">i</span><span class="p">][</span><span class="n">k</span><span class="p">]</span> <span class="o">*</span> <span class="n">M2</span><span class="p">[</span><span class="n">k</span><span class="p">][</span><span class="n">j</span><span class="p">]</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">M1</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">])])</span> <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">M2</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">])]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">M1</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">])]</span>

<span class="gp">In [7]: </span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">M1</span><span class="nd">@M2</span> <span class="o">-</span> <span class="n">res</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">1e-15</span><span class="p">)</span><span class="o">.</span><span class="n">all</span><span class="p">()</span>
<span class="gh">Out[7]: </span><span class="go">True</span>
</pre></div>
</div>
</section>
<section id="ex2">
<h3>Ex2：更新矩阵<a class="headerlink" href="#ex2" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [8]: </span><span class="n">A</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>

<span class="gp">In [9]: </span><span class="n">B</span> <span class="o">=</span> <span class="n">A</span><span class="o">*</span><span class="p">(</span><span class="mi">1</span><span class="o">/</span><span class="n">A</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">)</span>

<span class="gp">In [10]: </span><span class="n">B</span>
<span class="gh">Out[10]: </span>
<span class="go">array([[1.83333333, 3.66666667, 5.5       ],</span>
<span class="go">       [2.46666667, 3.08333333, 3.7       ],</span>
<span class="go">       [2.65277778, 3.03174603, 3.41071429]])</span>
</pre></div>
</div>
</section>
<section id="ex3">
<h3>Ex3：卡方统计量<a class="headerlink" href="#ex3" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [11]: </span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">seed</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>

<span class="gp">In [12]: </span><span class="n">A</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">20</span><span class="p">,</span> <span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>

<span class="gp">In [13]: </span><span class="n">B</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">*</span><span class="n">A</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">/</span><span class="n">A</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>

<span class="gp">In [14]: </span><span class="n">res</span> <span class="o">=</span> <span class="p">((</span><span class="n">A</span><span class="o">-</span><span class="n">B</span><span class="p">)</span><span class="o">**</span><span class="mi">2</span><span class="o">/</span><span class="n">B</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>

<span class="gp">In [15]: </span><span class="n">res</span>
<span class="gh">Out[15]: </span><span class="go">11.842696601945802</span>
</pre></div>
</div>
</section>
<section id="ex4">
<h3>Ex4：改进矩阵计算的性能<a class="headerlink" href="#ex4" title="Permalink to this heading">#</a></h3>
<p>原方法：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [16]: </span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">seed</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>

<span class="gp">In [17]: </span><span class="n">m</span><span class="p">,</span> <span class="n">n</span><span class="p">,</span> <span class="n">p</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">80</span><span class="p">,</span> <span class="mi">50</span>

<span class="gp">In [18]: </span><span class="n">B</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">(</span><span class="n">m</span><span class="p">,</span> <span class="n">p</span><span class="p">))</span>

<span class="gp">In [19]: </span><span class="n">U</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">(</span><span class="n">p</span><span class="p">,</span> <span class="n">n</span><span class="p">))</span>

<span class="gp">In [20]: </span><span class="n">Z</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">(</span><span class="n">m</span><span class="p">,</span> <span class="n">n</span><span class="p">))</span>
</pre></div>
</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [21]: </span><span class="k">def</span> <span class="nf">solution</span><span class="p">(</span><span class="n">B</span><span class="o">=</span><span class="n">B</span><span class="p">,</span> <span class="n">U</span><span class="o">=</span><span class="n">U</span><span class="p">,</span> <span class="n">Z</span><span class="o">=</span><span class="n">Z</span><span class="p">):</span>
<span class="gp">   ....: </span>    <span class="n">L_res</span> <span class="o">=</span> <span class="p">[]</span>
<span class="gp">   ....: </span>    <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">m</span><span class="p">):</span>
<span class="gp">   ....: </span>        <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n</span><span class="p">):</span>
<span class="gp">   ....: </span>            <span class="n">norm_value</span> <span class="o">=</span> <span class="p">((</span><span class="n">B</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">-</span><span class="n">U</span><span class="p">[:,</span><span class="n">j</span><span class="p">])</span><span class="o">**</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
<span class="gp">   ....: </span>            <span class="n">L_res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">norm_value</span><span class="o">*</span><span class="n">Z</span><span class="p">[</span><span class="n">i</span><span class="p">][</span><span class="n">j</span><span class="p">])</span>
<span class="gp">   ....: </span>    <span class="k">return</span> <span class="nb">sum</span><span class="p">(</span><span class="n">L_res</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [22]: </span><span class="n">solution</span><span class="p">(</span><span class="n">B</span><span class="p">,</span> <span class="n">U</span><span class="p">,</span> <span class="n">Z</span><span class="p">)</span>
<span class="gh">Out[22]: </span><span class="go">100566</span>
</pre></div>
</div>
<p>改进方法：</p>
<p>令 <span class="math notranslate nohighlight">\(Y_{ij} = \|B_i-U_j\|_2^2\)</span> ，则 <span class="math notranslate nohighlight">\(\displaystyle R=\sum_{i=1}^m\sum_{j=1}^n Y_{ij}Z_{ij}\)</span> ，这在 <code class="docutils literal notranslate"><span class="pre">Numpy</span></code> 中可以用逐元素的乘法后求和实现，因此问题转化为了如何构造 <span class="math notranslate nohighlight">\(Y\)</span> 矩阵。</p>
<div class="math notranslate nohighlight">
\[\begin{split}Y_{ij} &amp;= \|B_i-U_j\|_2^2\\
&amp;=\sum_{k=1}^p(B_{ik}-U_{kj})^2\\
&amp;=\sum_{k=1}^p B_{ik}^2+\sum_{k=1}^p U_{kj}^2-2\sum_{k=1}^p B_{ik}U_{kj}\\\end{split}\]</div>
<p>从上式可以看出，第一第二项分别为 <span class="math notranslate nohighlight">\(B\)</span> 的行平方和与 <span class="math notranslate nohighlight">\(U\)</span> 的列平方和，第三项是两倍的内积。因此， <span class="math notranslate nohighlight">\(Y\)</span> 矩阵可以写为三个部分，第一个部分是 <span class="math notranslate nohighlight">\(m\times n\)</span> 的全 <span class="math notranslate nohighlight">\(1\)</span> 矩阵每行乘以 <span class="math notranslate nohighlight">\(B\)</span> 对应行的行平方和，第二个部分是相同大小的全 <span class="math notranslate nohighlight">\(1\)</span> 矩阵每列乘以 <span class="math notranslate nohighlight">\(U\)</span> 对应列的列平方和，第三个部分恰为 <span class="math notranslate nohighlight">\(B\)</span> 矩阵与 <span class="math notranslate nohighlight">\(U\)</span> 矩阵乘积的两倍。从而结果如下：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [23]: </span><span class="p">(((</span><span class="n">B</span><span class="o">**</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="n">U</span><span class="o">**</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="o">-</span> <span class="mi">2</span><span class="o">*</span><span class="n">B</span><span class="nd">@U</span><span class="p">)</span><span class="o">*</span><span class="n">Z</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
<span class="gh">Out[23]: </span><span class="go">100566</span>
</pre></div>
</div>
<p>对比它们的性能：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [24]: </span><span class="o">%</span><span class="k">timeit</span> -n 30 solution(B, U, Z)
<span class="go">29.1 ms +- 2.51 ms per loop (mean +- std. dev. of 7 runs, 30 loops each)</span>
</pre></div>
</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [25]: </span><span class="o">%</span><span class="k">timeit</span> -n 30 ((np.ones((m,n))*(B**2).sum(1).reshape(-1,1) +\
<span class="gp">   ....: </span>                  <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">((</span><span class="n">m</span><span class="p">,</span><span class="n">n</span><span class="p">))</span><span class="o">*</span><span class="p">(</span><span class="n">U</span><span class="o">**</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="o">-</span> <span class="mi">2</span><span class="o">*</span><span class="n">B</span><span class="nd">@U</span><span class="p">)</span><span class="o">*</span><span class="n">Z</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
<span class="gp">   ....: </span>
<span class="go">596 us +- 74.1 us per loop (mean +- std. dev. of 7 runs, 30 loops each)</span>
</pre></div>
</div>
</section>
<section id="ex5">
<h3>Ex5：连续整数的最大长度<a class="headerlink" href="#ex5" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [26]: </span><span class="n">f</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="n">np</span><span class="o">.</span><span class="n">diff</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">nonzero</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">r_</span><span class="p">[</span><span class="mi">1</span><span class="p">,</span><span class="n">np</span><span class="o">.</span><span class="n">diff</span><span class="p">(</span><span class="n">x</span><span class="p">)</span><span class="o">!=</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">]))</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>

<span class="gp">In [27]: </span><span class="n">f</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="mi">5</span><span class="p">,</span><span class="mi">6</span><span class="p">,</span><span class="mi">7</span><span class="p">])</span>
<span class="gh">Out[27]: </span><span class="go">3</span>

<span class="gp">In [28]: </span><span class="n">f</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="mi">3</span><span class="p">,</span><span class="mi">4</span><span class="p">,</span><span class="mi">6</span><span class="p">])</span>
<span class="gh">Out[28]: </span><span class="go">4</span>
</pre></div>
</div>
</section>
</section>
<section id="pandas">
<h2>第二章 pandas基础<a class="headerlink" href="#pandas" title="Permalink to this heading">#</a></h2>
<section id="id3">
<h3>Ex1：口袋妖怪数据集<a class="headerlink" href="#id3" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [29]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/pokemon.csv&#39;</span><span class="p">)</span>

<span class="gp">In [30]: </span><span class="p">(</span><span class="n">df</span><span class="p">[[</span><span class="s1">&#39;HP&#39;</span><span class="p">,</span> <span class="s1">&#39;Attack&#39;</span><span class="p">,</span> <span class="s1">&#39;Defense&#39;</span><span class="p">,</span> <span class="s1">&#39;Sp. Atk&#39;</span><span class="p">,</span> <span class="s1">&#39;Sp. Def&#39;</span><span class="p">,</span> <span class="s1">&#39;Speed&#39;</span>
<span class="gp">   ....: </span>   <span class="p">]]</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">!=</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Total&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="gp">   ....: </span>
<span class="gh">Out[30]: </span><span class="go">0.0</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<ol class="loweralpha simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [31]: </span><span class="n">dp_dup</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">drop_duplicates</span><span class="p">(</span><span class="s1">&#39;#&#39;</span><span class="p">,</span> <span class="n">keep</span><span class="o">=</span><span class="s1">&#39;first&#39;</span><span class="p">)</span>

<span class="gp">In [32]: </span><span class="n">dp_dup</span><span class="p">[</span><span class="s1">&#39;Type 1&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">nunique</span><span class="p">()</span>
<span class="gh">Out[32]: </span><span class="go">18</span>

<span class="gp">In [33]: </span><span class="n">dp_dup</span><span class="p">[</span><span class="s1">&#39;Type 1&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">value_counts</span><span class="p">()</span><span class="o">.</span><span class="n">index</span><span class="p">[:</span><span class="mi">3</span><span class="p">]</span>
<span class="gh">Out[33]: </span><span class="go">Index([&#39;Water&#39;, &#39;Normal&#39;, &#39;Grass&#39;], dtype=&#39;object&#39;)</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [34]: </span><span class="n">attr_dup</span> <span class="o">=</span> <span class="n">dp_dup</span><span class="o">.</span><span class="n">drop_duplicates</span><span class="p">([</span><span class="s1">&#39;Type 1&#39;</span><span class="p">,</span> <span class="s1">&#39;Type 2&#39;</span><span class="p">])</span>

<span class="gp">In [35]: </span><span class="n">attr_dup</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="gh">Out[35]: </span><span class="go">143</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [36]: </span><span class="n">L_full</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span><span class="o">+</span><span class="s1">&#39; &#39;</span><span class="o">+</span><span class="n">j</span> <span class="k">if</span> <span class="n">i</span><span class="o">!=</span><span class="n">j</span> <span class="k">else</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;Type 1&#39;</span>
<span class="gp">   ....: </span>         <span class="p">]</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span> <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;Type 1&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span>
<span class="gp">   ....: </span>

<span class="gp">In [37]: </span><span class="n">L_part</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span><span class="o">+</span><span class="s1">&#39; &#39;</span><span class="o">+</span><span class="n">j</span> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">j</span><span class="p">,</span> <span class="nb">float</span><span class="p">)</span> <span class="k">else</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span>
<span class="gp">   ....: </span>          <span class="n">attr_dup</span><span class="p">[</span><span class="s1">&#39;Type 1&#39;</span><span class="p">],</span> <span class="n">attr_dup</span><span class="p">[</span><span class="s1">&#39;Type 2&#39;</span><span class="p">])]</span>
<span class="gp">   ....: </span>

<span class="gp">In [38]: </span><span class="n">res</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">L_full</span><span class="p">)</span><span class="o">.</span><span class="n">difference</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">L_part</span><span class="p">))</span>

<span class="gp">In [39]: </span><span class="nb">len</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="c1"># 太多，不打印了</span>
<span class="gh">Out[39]: </span><span class="go">181</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<ol class="loweralpha simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [40]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Attack&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">mask</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Attack&#39;</span><span class="p">]</span><span class="o">&gt;</span><span class="mi">120</span><span class="p">,</span> <span class="s1">&#39;high&#39;</span>
<span class="gp">   ....: </span>                 <span class="p">)</span><span class="o">.</span><span class="n">mask</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Attack&#39;</span><span class="p">]</span><span class="o">&lt;</span><span class="mi">50</span><span class="p">,</span> <span class="s1">&#39;low&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">mask</span><span class="p">((</span><span class="mi">50</span><span class="o">&lt;=</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Attack&#39;</span><span class="p">]</span>
<span class="gp">   ....: </span>                 <span class="p">)</span><span class="o">&amp;</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Attack&#39;</span><span class="p">]</span><span class="o">&lt;=</span><span class="mi">120</span><span class="p">),</span> <span class="s1">&#39;mid&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">   ....: </span>
<span class="gh">Out[40]: </span>
<span class="go">0    low</span>
<span class="go">1    mid</span>
<span class="go">2    mid</span>
<span class="go">3    mid</span>
<span class="go">4    mid</span>
<span class="go">Name: Attack, dtype: object</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [41]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Type 1&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">replace</span><span class="p">({</span><span class="n">i</span><span class="p">:</span><span class="nb">str</span><span class="o">.</span><span class="n">upper</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;Type 1&#39;</span>
<span class="gp">   ....: </span>            <span class="p">]</span><span class="o">.</span><span class="n">unique</span><span class="p">()})</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">   ....: </span>
<span class="gh">Out[41]: </span>
<span class="go">0    GRASS</span>
<span class="go">1    GRASS</span>
<span class="go">2    GRASS</span>
<span class="go">3    GRASS</span>
<span class="go">4     FIRE</span>
<span class="go">Name: Type 1, dtype: object</span>

<span class="gp">In [42]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Type 1&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="nb">str</span><span class="o">.</span><span class="n">upper</span><span class="p">(</span><span class="n">x</span><span class="p">))</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[42]: </span>
<span class="go">0    GRASS</span>
<span class="go">1    GRASS</span>
<span class="go">2    GRASS</span>
<span class="go">3    GRASS</span>
<span class="go">4     FIRE</span>
<span class="go">Name: Type 1, dtype: object</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [43]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Deviation&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[[</span><span class="s1">&#39;HP&#39;</span><span class="p">,</span> <span class="s1">&#39;Attack&#39;</span><span class="p">,</span> <span class="s1">&#39;Defense&#39;</span><span class="p">,</span> <span class="s1">&#39;Sp. Atk&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                     <span class="s1">&#39;Sp. Def&#39;</span><span class="p">,</span> <span class="s1">&#39;Speed&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span>
<span class="gp">   ....: </span>                     <span class="p">(</span><span class="n">x</span><span class="o">-</span><span class="n">x</span><span class="o">.</span><span class="n">median</span><span class="p">())</span><span class="o">.</span><span class="n">abs</span><span class="p">()),</span> <span class="mi">1</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [44]: </span><span class="n">df</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="s1">&#39;Deviation&#39;</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[44]: </span>
<span class="go">       #                 Name  Type 1  Type 2  Total   HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  Deviation</span>
<span class="go">230  213              Shuckle     Bug    Rock    505   20      10      230       10      230      5      215.0</span>
<span class="go">121  113              Chansey  Normal     NaN    450  250       5        5       35      105     50      207.5</span>
<span class="go">261  242              Blissey  Normal     NaN    540  255      10       10       75      135     55      190.0</span>
<span class="go">333  306    AggronMega Aggron   Steel     NaN    630   70     140      230       60       80     50      155.0</span>
<span class="go">224  208  SteelixMega Steelix   Steel  Ground    610   75     125      230       55       95     30      145.0</span>
</pre></div>
</div>
</section>
<section id="id4">
<h3>Ex2：指数加权窗口<a class="headerlink" href="#id4" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [45]: </span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">seed</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>

<span class="gp">In [46]: </span><span class="n">s</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="mi">30</span><span class="p">)</span><span class="o">.</span><span class="n">cumsum</span><span class="p">())</span>

<span class="gp">In [47]: </span><span class="n">s</span><span class="o">.</span><span class="n">ewm</span><span class="p">(</span><span class="n">alpha</span><span class="o">=</span><span class="mf">0.2</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[47]: </span>
<span class="go">0   -1.000000</span>
<span class="go">1   -1.000000</span>
<span class="go">2   -1.409836</span>
<span class="go">3   -1.609756</span>
<span class="go">4   -1.725845</span>
<span class="go">dtype: float64</span>

<span class="gp">In [48]: </span><span class="k">def</span> <span class="nf">ewm_func</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=</span><span class="mf">0.2</span><span class="p">):</span>
<span class="gp">   ....: </span>    <span class="n">win</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span><span class="o">-</span><span class="n">alpha</span><span class="p">)</span><span class="o">**</span><span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">])[::</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="gp">   ....: </span>    <span class="n">res</span> <span class="o">=</span> <span class="p">(</span><span class="n">win</span><span class="o">*</span><span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span><span class="o">/</span><span class="n">win</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
<span class="gp">   ....: </span>    <span class="k">return</span> <span class="n">res</span>
<span class="gp">   ....: </span>

<span class="gp">In [49]: </span><span class="n">s</span><span class="o">.</span><span class="n">expanding</span><span class="p">()</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">ewm_func</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[49]: </span>
<span class="go">0   -1.000000</span>
<span class="go">1   -1.000000</span>
<span class="go">2   -1.409836</span>
<span class="go">3   -1.609756</span>
<span class="go">4   -1.725845</span>
<span class="go">dtype: float64</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<p>新的权重为 <span class="math notranslate nohighlight">\(w_i = (1 - \alpha)^i, i\in \{0,1,...,n-1\}\)</span> ，<span class="math notranslate nohighlight">\(y_t\)</span> 更新如下：</p>
<div class="math notranslate nohighlight">
\[\begin{split}y_t &amp;=\frac{\sum_{i=0}^{n-1} w_i x_{t-i}}{\sum_{i=0}^{n-1} w_i} \\
&amp;=\frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...
+ (1 - \alpha)^{n-1} x_{t-(n-1)}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ...
+ (1 - \alpha)^{n-1}}\\\end{split}\]</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [50]: </span><span class="n">s</span><span class="o">.</span><span class="n">rolling</span><span class="p">(</span><span class="n">window</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">ewm_func</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> <span class="c1"># 无需对原函数改动</span>
<span class="gh">Out[50]: </span>
<span class="go">0         NaN</span>
<span class="go">1         NaN</span>
<span class="go">2         NaN</span>
<span class="go">3   -1.609756</span>
<span class="go">4   -1.826558</span>
<span class="go">dtype: float64</span>
</pre></div>
</div>
</section>
</section>
<section id="id5">
<h2>第三章 索引<a class="headerlink" href="#id5" title="Permalink to this heading">#</a></h2>
<section id="id6">
<h3>Ex1：公司员工数据集<a class="headerlink" href="#id6" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [51]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/company.csv&#39;</span><span class="p">)</span>

<span class="gp">In [52]: </span><span class="n">dpt</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Dairy&#39;</span><span class="p">,</span> <span class="s1">&#39;Bakery&#39;</span><span class="p">]</span>

<span class="gp">In [53]: </span><span class="n">df</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&quot;(age &lt;= 40)&amp;(department == @dpt)&amp;(gender==&#39;M&#39;)&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[53]: </span>
<span class="go">      EmployeeID birthdate_key  age city_name department     job_title gender</span>
<span class="go">3611        5791     1/14/1975   40   Kelowna      Dairy  Dairy Person      M</span>
<span class="go">3613        5793     1/22/1975   40  Richmond     Bakery         Baker      M</span>
<span class="go">3615        5795     1/30/1975   40   Nanaimo      Dairy  Dairy Person      M</span>

<span class="gp">In [54]: </span><span class="n">df</span><span class="o">.</span><span class="n">loc</span><span class="p">[(</span><span class="n">df</span><span class="o">.</span><span class="n">age</span><span class="o">&lt;=</span><span class="mi">40</span><span class="p">)</span><span class="o">&amp;</span><span class="n">df</span><span class="o">.</span><span class="n">department</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span><span class="n">dpt</span><span class="p">)</span><span class="o">&amp;</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">gender</span><span class="o">==</span><span class="s1">&#39;M&#39;</span><span class="p">)]</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[54]: </span>
<span class="go">      EmployeeID birthdate_key  age city_name department     job_title gender</span>
<span class="go">3611        5791     1/14/1975   40   Kelowna      Dairy  Dairy Person      M</span>
<span class="go">3613        5793     1/22/1975   40  Richmond     Bakery         Baker      M</span>
<span class="go">3615        5795     1/30/1975   40   Nanaimo      Dairy  Dairy Person      M</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
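<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [55]: </span><span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[(</span><span class="n">df</span><span class="o">.</span><span class="n">EmployeeID</span><span class="o">%</span><span class="mi">2</span><span class="o">==</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">values</span><span class="p">,[</span><span class="mi">0</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="o">-</span><span class="mi">2</span><span class="p">]]</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>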
<span class="gh">Out[55]: </span>
<span class="go">   EmployeeID  age                      job_title</span>
<span class="go">1        1319   58                      VP Stores</span>
<span class="go">3        1321   56             VP Human Resources</span>
<span class="go">5        1323   53      Exec Assistant, VP Stores</span>
<span class="go">6        1325   51  Exec Assistant, Legal Counsel</span>
<span class="go">8        1329   48                  Store Manager</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [56]: </span><span class="n">df_op</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>

<span class="gp">In [57]: </span><span class="n">df_op</span> <span class="o">=</span> <span class="n">df_op</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="n">df_op</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">3</span><span class="p">:]</span><span class="o">.</span><span class="n">tolist</span><span class="p">())</span><span class="o">.</span><span class="n">swaplevel</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>

<span class="gp">In [58]: </span><span class="n">df_op</span> <span class="o">=</span> <span class="n">df_op</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>

<span class="gp">In [59]: </span><span class="n">df_op</span> <span class="o">=</span> <span class="n">df_op</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;gender&#39;</span><span class="p">:</span><span class="s1">&#39;Gender&#39;</span><span class="p">})</span>

<span class="gp">In [60]: </span><span class="n">df_op</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="n">df_op</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="s1">&#39;_&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>

<span class="gp">In [61]: </span><span class="n">df_op</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="n">df_op</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="nb">tuple</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s1">&#39;_&#39;</span><span class="p">)))</span>

<span class="gp">In [62]: </span><span class="n">df_op</span> <span class="o">=</span> <span class="n">df_op</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;gender&#39;</span><span class="p">,</span> <span class="s1">&#39;department&#39;</span><span class="p">])</span>

<span class="gp">In [63]: </span><span class="n">df_op</span> <span class="o">=</span> <span class="n">df_op</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span><span class="o">.</span><span class="n">reindex</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>

<span class="gp">In [64]: </span><span class="n">df_op</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gh">Out[64]: </span><span class="go">True</span>
</pre></div>
</div>
</section>
<section id="id7">
<h3>Ex2：巧克力数据集<a class="headerlink" href="#id7" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [65]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/chocolate.csv&#39;</span><span class="p">)</span>

<span class="gp">In [66]: </span><span class="n">df</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39; &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">i</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s1">&#39;</span><span class="se">\r\n</span><span class="s1">&#39;</span><span class="p">))</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>

<span class="gp">In [67]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[67]: </span>
<span class="go">    Company  Review Date Cocoa Percent Company Location  Rating</span>
<span class="go">0  A. Morin         2016           63%           France    3.75</span>
<span class="go">1  A. Morin         2015           70%           France    2.75</span>
<span class="go">2  A. Morin         2015           70%           France    3.00</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [68]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Cocoa Percent&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;Cocoa Percent&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="nb">float</span><span class="p">(</span><span class="n">x</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span><span class="o">/</span><span class="mi">100</span><span class="p">)</span>

<span class="gp">In [69]: </span><span class="n">df</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s1">&#39;(Rating&lt;3)&amp;(`Cocoa Percent`&gt;`Cocoa Percent`.median())&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[69]: </span>
<span class="go">               Company  Review Date  Cocoa Percent Company Location  Rating</span>
<span class="go">33  Akesson&#39;s (Pralus)         2010           0.75      Switzerland    2.75</span>
<span class="go">34  Akesson&#39;s (Pralus)         2010           0.75      Switzerland    2.75</span>
<span class="go">36       Alain Ducasse         2014           0.75           France    2.75</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [70]: </span><span class="n">idx</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">IndexSlice</span>

<span class="gp">In [71]: </span><span class="n">exclude</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;France&#39;</span><span class="p">,</span> <span class="s1">&#39;Canada&#39;</span><span class="p">,</span> <span class="s1">&#39;Amsterdam&#39;</span><span class="p">,</span> <span class="s1">&#39;Belgium&#39;</span><span class="p">]</span>

<span class="gp">In [72]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">([</span><span class="s1">&#39;Review Date&#39;</span><span class="p">,</span> <span class="s1">&#39;Company Location&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">sort_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>

<span class="gp">In [73]: </span><span class="n">res</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">idx</span><span class="p">[</span><span class="mi">2012</span><span class="p">:,</span><span class="o">~</span><span class="n">res</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">get_level_values</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span><span class="n">exclude</span><span class="p">)],:]</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[73]: </span>
<span class="go">                                  Company  Cocoa Percent  Rating</span>
<span class="go">Review Date Company Location                                    </span>
<span class="go">2012        Australia         Bahen &amp; Co.            0.7     3.0</span>
<span class="go">            Australia         Bahen &amp; Co.            0.7     2.5</span>
<span class="go">            Australia         Bahen &amp; Co.            0.7     2.5</span>
</pre></div>
</div>
</section>
</section>
<section id="id8">
<h2>第四章 分组<a class="headerlink" href="#id8" title="Permalink to this heading">#</a></h2>
<section id="id9">
<h3>Ex1：汽车数据集<a class="headerlink" href="#id9" title="Permalink to this heading">#</a></h3>
<p>现有一份关于汽车的数据集，其中 <code class="docutils literal notranslate"><span class="pre">Brand,</span> <span class="pre">Disp.,</span> <span class="pre">HP</span></code> 分别代表汽车品牌、发动机排量、发动机输出。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [74]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/car.csv&#39;</span><span class="p">)</span>

<span class="gp">In [75]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[75]: </span>
<span class="go">             Brand  Price Country  Reliability  Mileage   Type  Weight  Disp.   HP</span>
<span class="go">0   Eagle Summit 4   8895     USA          4.0       33  Small    2560     97  113</span>
<span class="go">1  Ford Escort   4   7402     USA          2.0       33  Small    2345    114   90</span>
<span class="go">2   Ford Festiva 4   6319   Korea          4.0       37  Small    1845     81   63</span>
</pre></div>
</div>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [76]: </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Country&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">&gt;</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span>
<span class="gp">   ....: </span>           <span class="s1">&#39;Country&#39;</span><span class="p">)[</span><span class="s1">&#39;Price&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">agg</span><span class="p">([(</span>
<span class="gp">   ....: </span>           <span class="s1">&#39;CoV&#39;</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">std</span><span class="p">()</span><span class="o">/</span><span class="n">x</span><span class="o">.</span><span class="n">mean</span><span class="p">()),</span> <span class="s1">&#39;mean&#39;</span><span class="p">,</span> <span class="s1">&#39;count&#39;</span><span class="p">])</span>
<span class="gp">   ....: </span>
<span class="gh">Out[76]: </span>
<span class="go">                CoV          mean  count</span>
<span class="go">Country                                 </span>
<span class="go">Japan      0.387429  13938.052632     19</span>
<span class="go">Japan/USA  0.240040  10067.571429      7</span>
<span class="go">Korea      0.243435   7857.333333      3</span>
<span class="go">USA        0.203344  12543.269231     26</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [77]: </span><span class="n">df</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="gh">Out[77]: </span><span class="go">60</span>

<span class="gp">In [78]: </span><span class="n">condition</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Head&#39;</span><span class="p">]</span><span class="o">*</span><span class="mi">20</span><span class="o">+</span><span class="p">[</span><span class="s1">&#39;Mid&#39;</span><span class="p">]</span><span class="o">*</span><span class="mi">20</span><span class="o">+</span><span class="p">[</span><span class="s1">&#39;Tail&#39;</span><span class="p">]</span><span class="o">*</span><span class="mi">20</span>

<span class="gp">In [79]: </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">condition</span><span class="p">)[</span><span class="s1">&#39;Price&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="gh">Out[79]: </span>
<span class="go">Head     9069.95</span>
<span class="go">Mid     13356.40</span>
<span class="go">Tail    15420.65</span>
<span class="go">Name: Price, dtype: float64</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [80]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Type&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">({</span><span class="s1">&#39;Price&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;max&#39;</span><span class="p">],</span> <span class="s1">&#39;HP&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;min&#39;</span><span class="p">]})</span>

<span class="gp">In [81]: </span><span class="n">res</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="s1">&#39;_&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>

<span class="gp">In [82]: </span><span class="n">res</span>
<span class="gh">Out[82]: </span>
<span class="go">         Price_max  HP_min</span>
<span class="go">Type                      </span>
<span class="go">Compact      18900      95</span>
<span class="go">Large        17257     150</span>
<span class="go">Medium       24760     110</span>
<span class="go">Small         9995      63</span>
<span class="go">Sporty       13945      92</span>
<span class="go">Van          15395     106</span>
</pre></div>
</div>
<ol class="arabic simple" start="4">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [83]: </span><span class="k">def</span> <span class="nf">normalize</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
<span class="gp">   ....: </span>    <span class="n">s_min</span><span class="p">,</span> <span class="n">s_max</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">min</span><span class="p">(),</span> <span class="n">s</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
<span class="gp">   ....: </span>    <span class="n">res</span> <span class="o">=</span> <span class="p">(</span><span class="n">s</span> <span class="o">-</span> <span class="n">s_min</span><span class="p">)</span><span class="o">/</span><span class="p">(</span><span class="n">s_max</span> <span class="o">-</span> <span class="n">s_min</span><span class="p">)</span>
<span class="gp">   ....: </span>    <span class="k">return</span> <span class="n">res</span>
<span class="gp">   ....: </span>

<span class="gp">In [84]: </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Type&#39;</span><span class="p">)[</span><span class="s1">&#39;HP&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">normalize</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[84]: </span>
<span class="go">0    1.00</span>
<span class="go">1    0.54</span>
<span class="go">2    0.00</span>
<span class="go">3    0.58</span>
<span class="go">4    0.80</span>
<span class="go">Name: HP, dtype: float64</span>
</pre></div>
</div>
<ol class="arabic simple" start="5">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [85]: </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Type&#39;</span><span class="p">)[[</span><span class="s1">&#39;HP&#39;</span><span class="p">,</span> <span class="s1">&#39;Disp.&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span>
<span class="gp">   ....: </span>   <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="n">np</span><span class="o">.</span><span class="n">corrcoef</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="s1">&#39;HP&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">x</span><span class="p">[</span><span class="s1">&#39;Disp.&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">)[</span><span class="mi">0</span><span class="p">,</span><span class="mi">1</span><span class="p">])</span>
<span class="gp">   ....: </span>
<span class="gh">Out[85]: </span>
<span class="go">Type</span>
<span class="go">Compact    0.586087</span>
<span class="go">Large     -0.242765</span>
<span class="go">Medium     0.370491</span>
<span class="go">Small      0.603916</span>
<span class="go">Sporty     0.871426</span>
<span class="go">Van        0.819881</span>
<span class="go">dtype: float64</span>
</pre></div>
</div>
</section>
<section id="ex2-transform">
<h3>Ex2：实现transform函数<a class="headerlink" href="#ex2-transform" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [86]: </span><span class="k">class</span> <span class="nc">my_groupby</span><span class="p">:</span>
<span class="gp">   ....: </span>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">my_df</span><span class="p">,</span> <span class="n">group_cols</span><span class="p">):</span>
<span class="gp">   ....: </span>        <span class="bp">self</span><span class="o">.</span><span class="n">my_df</span> <span class="o">=</span> <span class="n">my_df</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="gp">   ....: </span>        <span class="bp">self</span><span class="o">.</span><span class="n">groups</span> <span class="o">=</span> <span class="n">my_df</span><span class="p">[</span><span class="n">group_cols</span><span class="p">]</span><span class="o">.</span><span class="n">drop_duplicates</span><span class="p">()</span>
<span class="gp">   ....: </span>        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">groups</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="gp">   ....: </span>            <span class="bp">self</span><span class="o">.</span><span class="n">groups</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">groups</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="gp">   ....: </span>        <span class="bp">self</span><span class="o">.</span><span class="n">group_cols</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">groups</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
<span class="gp">   ....: </span>        <span class="bp">self</span><span class="o">.</span><span class="n">groups</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">groups</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="o">.</span><span class="n">tolist</span><span class="p">(</span>
<span class="gp">   ....: </span>                       <span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">groups</span><span class="o">.</span><span class="n">columns</span><span class="p">}</span>
<span class="gp">   ....: </span>        <span class="bp">self</span><span class="o">.</span><span class="n">transform_col</span> <span class="o">=</span> <span class="kc">None</span>
<span class="gp">   ....: </span>    <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">):</span>
<span class="gp">   ....: </span>        <span class="bp">self</span><span class="o">.</span><span class="n">pr_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="nb">list</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="gp">   ....: </span>        <span class="k">return</span> <span class="bp">self</span>
<span class="gp">   ....: </span>    <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">my_func</span><span class="p">):</span>
<span class="gp">   ....: </span>        <span class="bp">self</span><span class="o">.</span><span class="n">num</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">groups</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">group_cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]])</span>
<span class="gp">   ....: </span>        <span class="n">L_order</span><span class="p">,</span> <span class="n">L_value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([]),</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([])</span>
<span class="gp">   ....: </span>        <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">num</span><span class="p">):</span>
<span class="gp">   ....: </span>            <span class="n">group_df</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">my_df</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="gp">   ....: </span>            <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">group_cols</span><span class="p">:</span>
<span class="gp">   ....: </span>                <span class="n">group_df</span> <span class="o">=</span> <span class="n">group_df</span><span class="p">[</span><span class="n">group_df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">==</span><span class="bp">self</span><span class="o">.</span><span class="n">groups</span><span class="p">[</span><span class="n">col</span><span class="p">][</span><span class="n">i</span><span class="p">]]</span>
<span class="gp">   ....: </span>            <span class="n">group_df</span> <span class="o">=</span> <span class="n">group_df</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">pr_col</span><span class="p">]</span>
<span class="gp">   ....: </span>            <span class="k">if</span> <span class="n">group_df</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="gp">   ....: </span>                <span class="n">group_df</span> <span class="o">=</span> <span class="n">group_df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">]</span>
<span class="gp">   ....: </span>            <span class="n">group_res</span> <span class="o">=</span> <span class="n">my_func</span><span class="p">(</span><span class="n">group_df</span><span class="p">)</span>
<span class="gp">   ....: </span>            <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">group_res</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="gp">   ....: </span>                <span class="n">group_res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">group_res</span><span class="p">,</span>
<span class="gp">   ....: </span>                                      <span class="n">index</span><span class="o">=</span><span class="n">group_df</span><span class="o">.</span><span class="n">index</span><span class="p">,</span>
<span class="gp">   ....: </span>                                      <span class="n">name</span><span class="o">=</span><span class="n">group_df</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="gp">   ....: </span>            <span class="n">L_order</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">r_</span><span class="p">[</span><span class="n">L_order</span><span class="p">,</span> <span class="n">group_res</span><span class="o">.</span><span class="n">index</span><span class="p">]</span>
<span class="gp">   ....: </span>            <span class="n">L_value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">r_</span><span class="p">[</span><span class="n">L_value</span><span class="p">,</span> <span class="n">group_res</span><span class="o">.</span><span class="n">values</span><span class="p">]</span>
<span class="gp">   ....: </span>        <span class="bp">self</span><span class="o">.</span><span class="n">res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">L_value</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">L_order</span><span class="p">)</span><span class="o">.</span><span class="n">sort_index</span><span class="p">(</span>
<span class="gp">   ....: </span>                   <span class="p">)</span><span class="o">.</span><span class="n">values</span><span class="p">,</span><span class="n">index</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">my_df</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span>
<span class="gp">   ....: </span>                   <span class="p">)</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="n">my_func</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="gp">   ....: </span>        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">res</span>
<span class="gp">   ....: </span>

<span class="gp">In [87]: </span><span class="n">my_groupby</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s1">&#39;Type&#39;</span><span class="p">)</span>
<span class="gh">Out[87]: </span><span class="go">&lt;__main__.my_groupby at 0x29238dedc10&gt;</span>
</pre></div>
</div>
<p>单列分组：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [88]: </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
<span class="gp">   ....: </span>    <span class="n">res</span> <span class="o">=</span> <span class="p">(</span><span class="n">s</span><span class="o">-</span><span class="n">s</span><span class="o">.</span><span class="n">min</span><span class="p">())</span><span class="o">/</span><span class="p">(</span><span class="n">s</span><span class="o">.</span><span class="n">max</span><span class="p">()</span><span class="o">-</span><span class="n">s</span><span class="o">.</span><span class="n">min</span><span class="p">())</span>
<span class="gp">   ....: </span>    <span class="k">return</span> <span class="n">res</span>
<span class="gp">   ....: </span>

<span class="gp">In [89]: </span><span class="n">my_groupby</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s1">&#39;Type&#39;</span><span class="p">)[</span><span class="s1">&#39;Price&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">f</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[89]: </span>
<span class="go">0    0.733592</span>
<span class="go">1    0.372003</span>
<span class="go">2    0.109712</span>
<span class="go">3    0.186244</span>
<span class="go">4    0.177525</span>
<span class="go">Name: f, dtype: float64</span>

<span class="gp">In [90]: </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Type&#39;</span><span class="p">)[</span><span class="s1">&#39;Price&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">f</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[90]: </span>
<span class="go">0    0.733592</span>
<span class="go">1    0.372003</span>
<span class="go">2    0.109712</span>
<span class="go">3    0.186244</span>
<span class="go">4    0.177525</span>
<span class="go">Name: Price, dtype: float64</span>
</pre></div>
</div>
<p>多列分组：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [91]: </span><span class="n">my_groupby</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="p">[</span><span class="s1">&#39;Type&#39;</span><span class="p">,</span><span class="s1">&#39;Country&#39;</span><span class="p">])[</span><span class="s1">&#39;Price&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">f</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[91]: </span>
<span class="go">0    1.000000</span>
<span class="go">1    0.000000</span>
<span class="go">2    0.000000</span>
<span class="go">3    0.000000</span>
<span class="go">4    0.196357</span>
<span class="go">Name: f, dtype: float64</span>

<span class="gp">In [92]: </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">([</span><span class="s1">&#39;Type&#39;</span><span class="p">,</span><span class="s1">&#39;Country&#39;</span><span class="p">])[</span><span class="s1">&#39;Price&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">f</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[92]: </span>
<span class="go">0    1.000000</span>
<span class="go">1    0.000000</span>
<span class="go">2    0.000000</span>
<span class="go">3    0.000000</span>
<span class="go">4    0.196357</span>
<span class="go">Name: Price, dtype: float64</span>
</pre></div>
</div>
<p>标量广播：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [93]: </span><span class="n">my_groupby</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s1">&#39;Type&#39;</span><span class="p">)[</span><span class="s1">&#39;Price&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="n">x</span><span class="o">.</span><span class="n">mean</span><span class="p">())</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[93]: </span>
<span class="go">0    7682.384615</span>
<span class="go">1    7682.384615</span>
<span class="go">2    7682.384615</span>
<span class="go">3    7682.384615</span>
<span class="go">4    7682.384615</span>
<span class="go">Name: &lt;lambda&gt;, dtype: float64</span>

<span class="gp">In [94]: </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Type&#39;</span><span class="p">)[</span><span class="s1">&#39;Price&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="n">x</span><span class="o">.</span><span class="n">mean</span><span class="p">())</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[94]: </span>
<span class="go">0    7682.384615</span>
<span class="go">1    7682.384615</span>
<span class="go">2    7682.384615</span>
<span class="go">3    7682.384615</span>
<span class="go">4    7682.384615</span>
<span class="go">Name: Price, dtype: float64</span>
</pre></div>
</div>
<p>跨列计算：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [95]: </span><span class="n">my_groupby</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s1">&#39;Type&#39;</span><span class="p">)[</span><span class="s1">&#39;Disp.&#39;</span><span class="p">,</span> <span class="s1">&#39;HP&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span>
<span class="gp">   ....: </span>               <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="s1">&#39;Disp.&#39;</span><span class="p">]</span><span class="o">/</span><span class="n">x</span><span class="o">.</span><span class="n">HP</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">   ....: </span>
<span class="gh">Out[95]: </span>
<span class="go">0    0.858407</span>
<span class="go">1    1.266667</span>
<span class="go">2    1.285714</span>
<span class="go">3    0.989130</span>
<span class="go">4    1.097087</span>
<span class="go">Name: &lt;lambda&gt;, dtype: float64</span>
</pre></div>
</div>
</section>
</section>
<section id="id10">
<h2>第五章 变形<a class="headerlink" href="#id10" title="Permalink to this heading">#</a></h2>
<section id="id11">
<h3>Ex1：美国非法药物数据集<a class="headerlink" href="#id11" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [96]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/drugs.csv&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">sort_values</span><span class="p">([</span>
<span class="gp">   ....: </span>     <span class="s1">&#39;State&#39;</span><span class="p">,</span><span class="s1">&#39;COUNTY&#39;</span><span class="p">,</span><span class="s1">&#39;SubstanceName&#39;</span><span class="p">],</span><span class="n">ignore_index</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [97]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;State&#39;</span><span class="p">,</span><span class="s1">&#39;COUNTY&#39;</span><span class="p">,</span><span class="s1">&#39;SubstanceName&#39;</span>
<span class="gp">   ....: </span>              <span class="p">],</span> <span class="n">columns</span><span class="o">=</span><span class="s1">&#39;YYYY&#39;</span><span class="p">,</span> <span class="n">values</span><span class="o">=</span><span class="s1">&#39;DrugReports&#39;</span>
<span class="gp">   ....: </span>              <span class="p">)</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;YYYY&#39;</span><span class="p">:</span><span class="s1">&#39;&#39;</span><span class="p">})</span>
<span class="gp">   ....: </span>

<span class="gp">In [98]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
<span class="gh">Out[98]: </span>
<span class="go">  State COUNTY  SubstanceName  2010  2011  2012  2013  2014  2015  2016  2017</span>
<span class="go">0    KY  ADAIR  Buprenorphine   NaN   3.0   5.0   4.0  27.0   5.0   7.0  10.0</span>
<span class="go">1    KY  ADAIR        Codeine   NaN   NaN   1.0   NaN   NaN   NaN   NaN   1.0</span>
<span class="go">2    KY  ADAIR       Fentanyl   NaN   NaN   1.0   NaN   NaN   NaN   NaN   NaN</span>
<span class="go">3    KY  ADAIR         Heroin   NaN   NaN   1.0   2.0   NaN   1.0   NaN   2.0</span>
<span class="go">4    KY  ADAIR    Hydrocodone   6.0   9.0  10.0  10.0   9.0   7.0  11.0   3.0</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [99]: </span><span class="n">res_melted</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">melt</span><span class="p">(</span><span class="n">id_vars</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;State&#39;</span><span class="p">,</span><span class="s1">&#39;COUNTY&#39;</span><span class="p">,</span><span class="s1">&#39;SubstanceName&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                     <span class="n">value_vars</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">8</span><span class="p">:],</span>
<span class="gp">   ....: </span>                     <span class="n">var_name</span> <span class="o">=</span> <span class="s1">&#39;YYYY&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                     <span class="n">value_name</span> <span class="o">=</span> <span class="s1">&#39;DrugReports&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span>
<span class="gp">   ....: </span>                     <span class="n">subset</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;DrugReports&#39;</span><span class="p">])</span>
<span class="gp">   ....: </span>

<span class="gp">In [100]: </span><span class="n">res_melted</span> <span class="o">=</span> <span class="n">res_melted</span><span class="p">[</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span><span class="o">.</span><span class="n">sort_values</span><span class="p">([</span>
<span class="gp">   .....: </span>              <span class="s1">&#39;State&#39;</span><span class="p">,</span><span class="s1">&#39;COUNTY&#39;</span><span class="p">,</span><span class="s1">&#39;SubstanceName&#39;</span><span class="p">],</span><span class="n">ignore_index</span><span class="o">=</span><span class="kc">True</span>
<span class="gp">   .....: </span>              <span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">({</span><span class="s1">&#39;YYYY&#39;</span><span class="p">:</span><span class="s1">&#39;int64&#39;</span><span class="p">,</span> <span class="s1">&#39;DrugReports&#39;</span><span class="p">:</span><span class="s1">&#39;int64&#39;</span><span class="p">})</span>
<span class="gp">   .....: </span><span class="n">res_melted</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">   .....: </span>
<span class="gh">Out[100]: </span><span class="go">True</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<p>策略一：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [101]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">pivot_table</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="s1">&#39;YYYY&#39;</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="s1">&#39;State&#39;</span><span class="p">,</span>
<span class="gp">   .....: </span>                     <span class="n">values</span><span class="o">=</span><span class="s1">&#39;DrugReports&#39;</span><span class="p">,</span> <span class="n">aggfunc</span><span class="o">=</span><span class="s1">&#39;sum&#39;</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [102]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[102]: </span>
<span class="go">State     KY     OH     PA    VA    WV</span>
<span class="go">YYYY                                  </span>
<span class="go">2010   10453  19707  19814  8685  2890</span>
<span class="go">2011   10289  20330  19987  6749  3271</span>
<span class="go">2012   10722  23145  19959  7831  3376</span>
</pre></div>
</div>
<p>策略二：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [103]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">([</span><span class="s1">&#39;State&#39;</span><span class="p">,</span> <span class="s1">&#39;YYYY&#39;</span><span class="p">])[</span><span class="s1">&#39;DrugReports&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span>
<span class="gp">   .....: </span>                <span class="p">)</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">unstack</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">droplevel</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [104]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[104]: </span>
<span class="go">State     KY     OH     PA    VA    WV</span>
<span class="go">YYYY                                  </span>
<span class="go">2010   10453  19707  19814  8685  2890</span>
<span class="go">2011   10289  20330  19987  6749  3271</span>
<span class="go">2012   10722  23145  19959  7831  3376</span>
</pre></div>
</div>
</section>
<section id="ex2-wide-to-long">
<h3>Ex2：特殊的wide_to_long方法<a class="headerlink" href="#ex2-wide-to-long" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [105]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Class&#39;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">],</span>
<span class="gp">   .....: </span>                  <span class="s1">&#39;Name&#39;</span><span class="p">:[</span><span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">],</span>
<span class="gp">   .....: </span>                  <span class="s1">&#39;Chinese&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">90</span><span class="p">],</span>
<span class="gp">   .....: </span>                  <span class="s1">&#39;Math&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">75</span><span class="p">]})</span>
<span class="gp">   .....: </span>

<span class="gp">In [106]: </span><span class="n">df</span>
<span class="gh">Out[106]: </span>
<span class="go">   Class       Name  Chinese  Math</span>
<span class="go">0      1  San Zhang       80    80</span>
<span class="go">1      2      Si Li       90    75</span>
</pre></div>
</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [107]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;Chinese&#39;</span><span class="p">:</span><span class="s1">&#39;pre_Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">:</span><span class="s1">&#39;pre_Math&#39;</span><span class="p">})</span>

<span class="gp">In [108]: </span><span class="n">pd</span><span class="o">.</span><span class="n">wide_to_long</span><span class="p">(</span><span class="n">df</span><span class="p">,</span>
<span class="gp">   .....: </span>                <span class="n">stubnames</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;pre&#39;</span><span class="p">],</span>
<span class="gp">   .....: </span>                <span class="n">i</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Class&#39;</span><span class="p">,</span> <span class="s1">&#39;Name&#39;</span><span class="p">],</span>
<span class="gp">   .....: </span>                <span class="n">j</span><span class="o">=</span><span class="s1">&#39;Subject&#39;</span><span class="p">,</span>
<span class="gp">   .....: </span>                <span class="n">sep</span><span class="o">=</span><span class="s1">&#39;_&#39;</span><span class="p">,</span>
<span class="gp">   .....: </span>                <span class="n">suffix</span><span class="o">=</span><span class="s1">&#39;.+&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;pre&#39;</span><span class="p">:</span><span class="s1">&#39;Grade&#39;</span><span class="p">})</span>
<span class="gp">   .....: </span>
<span class="gh">Out[108]: </span>
<span class="go">   Class       Name  Subject  Grade</span>
<span class="go">0      1  San Zhang  Chinese     80</span>
<span class="go">1      1  San Zhang     Math     80</span>
<span class="go">2      2      Si Li  Chinese     90</span>
<span class="go">3      2      Si Li     Math     75</span>
</pre></div>
</div>
</section>
</section>
<section id="id12">
<h2>第六章 连接<a class="headerlink" href="#id12" title="Permalink to this heading">#</a></h2>
<section id="id13">
<h3>Ex1：美国疫情数据集<a class="headerlink" href="#id13" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [109]: </span><span class="n">date</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">date_range</span><span class="p">(</span><span class="s1">&#39;20200412&#39;</span><span class="p">,</span> <span class="s1">&#39;20201116&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span>

<span class="gp">In [110]: </span><span class="n">date</span> <span class="o">=</span> <span class="n">date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">month</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;string&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">zfill</span><span class="p">(</span><span class="mi">2</span>
<span class="gp">   .....: </span>       <span class="p">)</span> <span class="o">+</span><span class="s1">&#39;-&#39;</span><span class="o">+</span> <span class="n">date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">day</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;string&#39;</span>
<span class="gp">   .....: </span>       <span class="p">)</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">zfill</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> <span class="o">+</span><span class="s1">&#39;-&#39;</span><span class="o">+</span> <span class="s1">&#39;2020&#39;</span>
<span class="gp">   .....: </span>

<span class="gp">In [111]: </span><span class="n">date</span> <span class="o">=</span> <span class="n">date</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [112]: </span><span class="n">L</span> <span class="o">=</span> <span class="p">[]</span>

<span class="gp">In [113]: </span><span class="k">for</span> <span class="n">d</span> <span class="ow">in</span> <span class="n">date</span><span class="p">:</span>
<span class="gp">   .....: </span>    <span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/us_report/&#39;</span> <span class="o">+</span> <span class="n">d</span> <span class="o">+</span> <span class="s1">&#39;.csv&#39;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="s1">&#39;Province_State&#39;</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="n">data</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="s1">&#39;New York&#39;</span><span class="p">,</span> <span class="p">[</span><span class="s1">&#39;Confirmed&#39;</span><span class="p">,</span><span class="s1">&#39;Deaths&#39;</span><span class="p">,</span>
<span class="gp">   .....: </span>                  <span class="s1">&#39;Recovered&#39;</span><span class="p">,</span><span class="s1">&#39;Active&#39;</span><span class="p">]]</span>
<span class="gp">   .....: </span>    <span class="n">L</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">T</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [114]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">(</span><span class="n">L</span><span class="p">)</span>

<span class="gp">In [115]: </span><span class="n">res</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="n">date</span>

<span class="gp">In [116]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[116]: </span>
<span class="go">           Confirmed Deaths Recovered    Active</span>
<span class="go">04-12-2020    189033   9385   23887.0    179648</span>
<span class="go">04-13-2020    195749  10058   23887.0  185691.0</span>
<span class="go">04-14-2020    203020  10842   23887.0  192178.0</span>
<span class="go">04-15-2020    214454  11617   23887.0  202837.0</span>
<span class="go">04-16-2020    223691  14832   23887.0  208859.0</span>
</pre></div>
</div>
</section>
<section id="ex2-join">
<h3>Ex2：实现join函数<a class="headerlink" href="#ex2-join" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [117]: </span><span class="k">def</span> <span class="nf">join</span><span class="p">(</span><span class="n">df1</span><span class="p">,</span> <span class="n">df2</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">&#39;left&#39;</span><span class="p">):</span>
<span class="gp">   .....: </span>    <span class="n">res_col</span> <span class="o">=</span> <span class="n">df1</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span> <span class="o">+</span>  <span class="n">df2</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
<span class="gp">   .....: </span>    <span class="n">dup</span> <span class="o">=</span> <span class="n">df1</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="n">df2</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">unique</span><span class="p">())</span>
<span class="gp">   .....: </span>    <span class="n">res_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">columns</span> <span class="o">=</span> <span class="n">res_col</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">dup</span><span class="p">:</span>
<span class="gp">   .....: </span>        <span class="n">cartesian</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">+</span><span class="nb">list</span><span class="p">(</span><span class="n">j</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">df1</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">label</span>
<span class="gp">   .....: </span>                    <span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">)</span> <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="n">df2</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span>
<span class="gp">   .....: </span>                      <span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">)]</span>
<span class="gp">   .....: </span>        <span class="n">dup_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">cartesian</span><span class="p">,</span> <span class="n">index</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span>
<span class="gp">   .....: </span>                 <span class="n">cartesian</span><span class="p">),</span> <span class="n">columns</span> <span class="o">=</span> <span class="n">res_col</span><span class="p">)</span>
<span class="gp">   .....: </span>        <span class="n">res_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">res_df</span><span class="p">,</span><span class="n">dup_df</span><span class="p">])</span>
<span class="gp">   .....: </span>    <span class="k">if</span> <span class="n">how</span> <span class="ow">in</span> <span class="p">[</span><span class="s1">&#39;left&#39;</span><span class="p">,</span> <span class="s1">&#39;outer&#39;</span><span class="p">]:</span>
<span class="gp">   .....: </span>        <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">df1</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span><span class="o">.</span><span class="n">difference</span><span class="p">(</span><span class="n">dup</span><span class="p">):</span>
<span class="gp">   .....: </span>            <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">df1</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">label</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="gp">   .....: </span>                <span class="n">cat</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">+</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">]</span><span class="o">*</span><span class="n">df2</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span>
<span class="gp">   .....: </span>                      <span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">df1</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">]</span>
<span class="gp">   .....: </span>            <span class="k">else</span><span class="p">:</span> <span class="n">cat</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">+</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">]</span><span class="o">*</span><span class="n">df2</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span>
<span class="gp">   .....: </span>                      <span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">df1</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">values</span><span class="p">]</span>
<span class="gp">   .....: </span>            <span class="n">dup_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">cat</span><span class="p">,</span> <span class="n">index</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span>
<span class="gp">   .....: </span>                      <span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span><span class="n">cat</span><span class="p">),</span> <span class="n">columns</span> <span class="o">=</span> <span class="n">res_col</span><span class="p">)</span>
<span class="gp">   .....: </span>            <span class="n">res_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">res_df</span><span class="p">,</span><span class="n">dup_df</span><span class="p">])</span>
<span class="gp">   .....: </span>    <span class="k">if</span> <span class="n">how</span> <span class="ow">in</span> <span class="p">[</span><span class="s1">&#39;right&#39;</span><span class="p">,</span> <span class="s1">&#39;outer&#39;</span><span class="p">]:</span>
<span class="gp">   .....: </span>        <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">df2</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span><span class="o">.</span><span class="n">difference</span><span class="p">(</span><span class="n">dup</span><span class="p">):</span>
<span class="gp">   .....: </span>            <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">df2</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">label</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="gp">   .....: </span>                <span class="n">cat</span> <span class="o">=</span> <span class="p">[[</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">]</span><span class="o">*</span><span class="n">df1</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span>
<span class="gp">   .....: </span>                      <span class="p">]</span><span class="o">+</span><span class="nb">list</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">df2</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">]</span>
<span class="gp">   .....: </span>            <span class="k">else</span><span class="p">:</span> <span class="n">cat</span> <span class="o">=</span> <span class="p">[[</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">]</span><span class="o">*</span><span class="n">df1</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span>
<span class="gp">   .....: </span>                      <span class="p">]</span><span class="o">+</span><span class="nb">list</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">df2</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">values</span><span class="p">]</span>
<span class="gp">   .....: </span>            <span class="n">dup_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">cat</span><span class="p">,</span> <span class="n">index</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span>
<span class="gp">   .....: </span>                      <span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span><span class="n">cat</span><span class="p">),</span> <span class="n">columns</span> <span class="o">=</span> <span class="n">res_col</span><span class="p">)</span>
<span class="gp">   .....: </span>            <span class="n">res_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">res_df</span><span class="p">,</span><span class="n">dup_df</span><span class="p">])</span>
<span class="gp">   .....: </span>    <span class="k">return</span> <span class="n">res_df</span>
<span class="gp">   .....: </span>

<span class="gp">In [118]: </span><span class="n">df1</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;col1&#39;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="mi">3</span><span class="p">,</span><span class="mi">4</span><span class="p">,</span><span class="mi">5</span><span class="p">]},</span> <span class="n">index</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="s1">&#39;AABCD&#39;</span><span class="p">))</span>

<span class="gp">In [119]: </span><span class="n">df1</span>
<span class="gh">Out[119]: </span>
<span class="go">   col1</span>
<span class="go">A     1</span>
<span class="go">A     2</span>
<span class="go">B     3</span>
<span class="go">C     4</span>
<span class="go">D     5</span>

<span class="gp">In [120]: </span><span class="n">df2</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;col2&#39;</span><span class="p">:</span><span class="nb">list</span><span class="p">(</span><span class="s1">&#39;opqrst&#39;</span><span class="p">)},</span> <span class="n">index</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="s1">&#39;ABBCEE&#39;</span><span class="p">))</span>

<span class="gp">In [121]: </span><span class="n">df2</span>
<span class="gh">Out[121]: </span>
<span class="go">  col2</span>
<span class="go">A    o</span>
<span class="go">B    p</span>
<span class="go">B    q</span>
<span class="go">C    r</span>
<span class="go">E    s</span>
<span class="go">E    t</span>

<span class="gp">In [122]: </span><span class="n">join</span><span class="p">(</span><span class="n">df1</span><span class="p">,</span> <span class="n">df2</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">&#39;outer&#39;</span><span class="p">)</span>
<span class="gh">Out[122]: </span>
<span class="go">  col1 col2</span>
<span class="go">A    1    o</span>
<span class="go">A    2    o</span>
<span class="go">B    3    p</span>
<span class="go">B    3    q</span>
<span class="go">C    4    r</span>
<span class="go">D    5  NaN</span>
<span class="go">E  NaN    s</span>
<span class="go">E  NaN    t</span>
</pre></div>
</div>
</section>
</section>
<section id="id14">
<h2>第七章 缺失数据<a class="headerlink" href="#id14" title="Permalink to this heading">#</a></h2>
<section id="id15">
<h3>Ex1：缺失值与类别的相关性检验<a class="headerlink" href="#id15" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [123]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/missing_chi.csv&#39;</span><span class="p">)</span>

<span class="gp">In [124]: </span><span class="n">cat_1</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">X_1</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="s1">&#39;NaN&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">mask</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">X_1</span><span class="o">.</span><span class="n">notna</span><span class="p">())</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="s2">&quot;NotNaN&quot;</span><span class="p">)</span>

<span class="gp">In [125]: </span><span class="n">cat_2</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">X_2</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="s1">&#39;NaN&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">mask</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">X_2</span><span class="o">.</span><span class="n">notna</span><span class="p">())</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="s2">&quot;NotNaN&quot;</span><span class="p">)</span>

<span class="gp">In [126]: </span><span class="n">df_1</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">cat_1</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">margins</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>

<span class="gp">In [127]: </span><span class="n">df_2</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">cat_2</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">margins</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>

<span class="gp">In [128]: </span><span class="k">def</span> <span class="nf">compute_S</span><span class="p">(</span><span class="n">my_df</span><span class="p">):</span>
<span class="gp">   .....: </span>    <span class="n">S</span> <span class="o">=</span> <span class="p">[]</span>
<span class="gp">   .....: </span>    <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">):</span>
<span class="gp">   .....: </span>        <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">):</span>
<span class="gp">   .....: </span>            <span class="n">E</span> <span class="o">=</span> <span class="n">my_df</span><span class="o">.</span><span class="n">iat</span><span class="p">[</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">]</span>
<span class="gp">   .....: </span>            <span class="n">F</span> <span class="o">=</span> <span class="n">my_df</span><span class="o">.</span><span class="n">iat</span><span class="p">[</span><span class="n">i</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span><span class="o">*</span><span class="n">my_df</span><span class="o">.</span><span class="n">iat</span><span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="n">j</span><span class="p">]</span><span class="o">/</span><span class="n">my_df</span><span class="o">.</span><span class="n">iat</span><span class="p">[</span><span class="mi">2</span><span class="p">,</span><span class="mi">2</span><span class="p">]</span>
<span class="gp">   .....: </span>            <span class="n">S</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">E</span><span class="o">-</span><span class="n">F</span><span class="p">)</span><span class="o">**</span><span class="mi">2</span><span class="o">/</span><span class="n">F</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="k">return</span> <span class="nb">sum</span><span class="p">(</span><span class="n">S</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [129]: </span><span class="n">res1</span> <span class="o">=</span> <span class="n">compute_S</span><span class="p">(</span><span class="n">df_1</span><span class="p">)</span>

<span class="gp">In [130]: </span><span class="n">res2</span> <span class="o">=</span> <span class="n">compute_S</span><span class="p">(</span><span class="n">df_2</span><span class="p">)</span>

<span class="gp">In [131]: </span><span class="kn">from</span> <span class="nn">scipy.stats</span> <span class="kn">import</span> <span class="n">chi2</span>

<span class="gp">In [132]: </span><span class="n">chi2</span><span class="o">.</span><span class="n">sf</span><span class="p">(</span><span class="n">res1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># X_1检验的p值 # 不能认为相关，剔除</span>
<span class="gh">Out[132]: </span><span class="go">0.9712760884395901</span>

<span class="gp">In [133]: </span><span class="n">chi2</span><span class="o">.</span><span class="n">sf</span><span class="p">(</span><span class="n">res2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># X_2检验的p值 # 认为相关，保留</span>
<span class="gh">Out[133]: </span><span class="go">7.459641265637543e-166</span>
</pre></div>
</div>
<p>结果与 <code class="docutils literal notranslate"><span class="pre">scipy.stats.chi2_contingency</span></code> 在不使用 Yates 修正的情况下完全一致：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [134]: </span><span class="kn">from</span> <span class="nn">scipy.stats</span> <span class="kn">import</span> <span class="n">chi2_contingency</span>

<span class="gp">In [135]: </span><span class="n">chi2_contingency</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">cat_1</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">y</span><span class="p">),</span> <span class="n">correction</span><span class="o">=</span><span class="kc">False</span><span class="p">)[</span><span class="mi">1</span><span class="p">]</span>
<span class="gh">Out[135]: </span><span class="go">0.9712760884395901</span>

<span class="gp">In [136]: </span><span class="n">chi2_contingency</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">cat_2</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">y</span><span class="p">),</span> <span class="n">correction</span><span class="o">=</span><span class="kc">False</span><span class="p">)[</span><span class="mi">1</span><span class="p">]</span>
<span class="gh">Out[136]: </span><span class="go">7.459641265637543e-166</span>
</pre></div>
</div>
</section>
<section id="id16">
<h3>Ex2：用回归模型解决分类问题<a class="headerlink" href="#id16" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [137]: </span><span class="kn">from</span> <span class="nn">sklearn.neighbors</span> <span class="kn">import</span> <span class="n">KNeighborsRegressor</span>

<span class="gp">In [138]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">&#39;data/color.xlsx&#39;</span><span class="p">)</span>

<span class="gp">In [139]: </span><span class="n">df_dummies</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">get_dummies</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">Color</span><span class="p">)</span>

<span class="gp">In [140]: </span><span class="n">stack_list</span> <span class="o">=</span> <span class="p">[]</span>

<span class="gp">In [141]: </span><span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">df_dummies</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span>
<span class="gp">   .....: </span>    <span class="n">clf</span> <span class="o">=</span> <span class="n">KNeighborsRegressor</span><span class="p">(</span><span class="n">n_neighbors</span><span class="o">=</span><span class="mi">6</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="n">clf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,:</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">df_dummies</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="n">res</span> <span class="o">=</span> <span class="n">clf</span><span class="o">.</span><span class="n">predict</span><span class="p">([[</span><span class="mf">0.8</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.2</span><span class="p">]])</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="n">stack_list</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [142]: </span><span class="n">code_res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">hstack</span><span class="p">(</span><span class="n">stack_list</span><span class="p">)</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span>

<span class="gp">In [143]: </span><span class="n">df_dummies</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="n">code_res</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span>
<span class="gh">Out[143]: </span><span class="go">&#39;Yellow&#39;</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [144]: </span><span class="kn">from</span> <span class="nn">sklearn.neighbors</span> <span class="kn">import</span> <span class="n">KNeighborsRegressor</span>

<span class="gp">In [145]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/audit.csv&#39;</span><span class="p">)</span>

<span class="gp">In [146]: </span><span class="n">res_df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>

<span class="gp">In [147]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">pd</span><span class="o">.</span><span class="n">get_dummies</span><span class="p">(</span><span class="n">df</span><span class="p">[[</span><span class="s1">&#39;Marital&#39;</span><span class="p">,</span> <span class="s1">&#39;Gender&#39;</span><span class="p">]]),</span>
<span class="gp">   .....: </span>    <span class="n">df</span><span class="p">[[</span><span class="s1">&#39;Age&#39;</span><span class="p">,</span><span class="s1">&#39;Income&#39;</span><span class="p">,</span><span class="s1">&#39;Hours&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span>
<span class="gp">   .....: </span>        <span class="k">lambda</span> <span class="n">x</span><span class="p">:(</span><span class="n">x</span><span class="o">-</span><span class="n">x</span><span class="o">.</span><span class="n">min</span><span class="p">())</span><span class="o">/</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">max</span><span class="p">()</span><span class="o">-</span><span class="n">x</span><span class="o">.</span><span class="n">min</span><span class="p">())),</span> <span class="n">df</span><span class="o">.</span><span class="n">Employment</span><span class="p">],</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [148]: </span><span class="n">X_train</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s1">&#39;Employment.notna()&#39;</span><span class="p">)</span>

<span class="gp">In [149]: </span><span class="n">X_test</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s1">&#39;Employment.isna()&#39;</span><span class="p">)</span>

<span class="gp">In [150]: </span><span class="n">df_dummies</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">get_dummies</span><span class="p">(</span><span class="n">X_train</span><span class="o">.</span><span class="n">Employment</span><span class="p">)</span>

<span class="gp">In [151]: </span><span class="n">stack_list</span> <span class="o">=</span> <span class="p">[]</span>

<span class="gp">In [152]: </span><span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">df_dummies</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span>
<span class="gp">   .....: </span>    <span class="n">clf</span> <span class="o">=</span> <span class="n">KNeighborsRegressor</span><span class="p">(</span><span class="n">n_neighbors</span><span class="o">=</span><span class="mi">6</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="n">clf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X_train</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">df_dummies</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="n">res</span> <span class="o">=</span> <span class="n">clf</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X_test</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="n">stack_list</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [153]: </span><span class="n">code_res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">hstack</span><span class="p">(</span><span class="n">stack_list</span><span class="p">)</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span>

<span class="gp">In [154]: </span><span class="n">cat_res</span> <span class="o">=</span> <span class="n">code_res</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span>
<span class="gp">   .....: </span>            <span class="nb">range</span><span class="p">(</span><span class="n">df_dummies</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">])),</span><span class="n">df_dummies</span><span class="o">.</span><span class="n">columns</span><span class="p">)))</span>
<span class="gp">   .....: </span>

<span class="gp">In [155]: </span><span class="n">res_df</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">res_df</span><span class="o">.</span><span class="n">Employment</span><span class="o">.</span><span class="n">isna</span><span class="p">(),</span> <span class="s1">&#39;Employment&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">cat_res</span><span class="o">.</span><span class="n">values</span>

<span class="gp">In [156]: </span><span class="n">res_df</span><span class="o">.</span><span class="n">isna</span><span class="p">()</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
<span class="gh">Out[156]: </span>
<span class="go">ID            0</span>
<span class="go">Age           0</span>
<span class="go">Employment    0</span>
<span class="go">Marital       0</span>
<span class="go">Income        0</span>
<span class="go">Gender        0</span>
<span class="go">Hours         0</span>
<span class="go">dtype: int64</span>
</pre></div>
</div>
</section>
</section>
<section id="id17">
<h2>第八章 文本数据<a class="headerlink" href="#id17" title="Permalink to this heading">#</a></h2>
<section id="id18">
<h3>Ex1：房屋信息数据集<a class="headerlink" href="#id18" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [157]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">&#39;data/house_info.xls&#39;</span><span class="p">,</span> <span class="n">usecols</span><span class="o">=</span><span class="p">[</span>
<span class="gp">   .....: </span>                <span class="s1">&#39;floor&#39;</span><span class="p">,</span><span class="s1">&#39;year&#39;</span><span class="p">,</span><span class="s1">&#39;area&#39;</span><span class="p">,</span><span class="s1">&#39;price&#39;</span><span class="p">])</span>
<span class="gp">   .....: </span>

<span class="gp">In [158]: </span><span class="n">df</span><span class="o">.</span><span class="n">year</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_numeric</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">year</span><span class="o">.</span><span class="n">str</span><span class="p">[:</span><span class="o">-</span><span class="mi">2</span><span class="p">])</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;Int64&#39;</span><span class="p">)</span>

<span class="gp">In [159]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[159]: </span>
<span class="go">      floor  year    area price</span>
<span class="go">0   高层（共6层）  1986  58.23㎡  155万</span>
<span class="go">1  中层（共20层）  2020     88㎡  155万</span>
<span class="go">2  低层（共28层）  2010  89.33㎡  365万</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [160]: </span><span class="n">pat</span> <span class="o">=</span> <span class="sa">r</span><span class="s1">&#39;(\w层)（共(\d+)层）&#39;</span>

<span class="gp">In [161]: </span><span class="n">new_cols</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">floor</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">extract</span><span class="p">(</span><span class="n">pat</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span>
<span class="gp">   .....: </span>                    <span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">0</span><span class="p">:</span><span class="s1">&#39;Level&#39;</span><span class="p">,</span> <span class="mi">1</span><span class="p">:</span><span class="s1">&#39;Highest&#39;</span><span class="p">})</span>
<span class="gp">   .....: </span>

<span class="gp">In [162]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">df</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;floor&#39;</span><span class="p">]),</span> <span class="n">new_cols</span><span class="p">],</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>

<span class="gp">In [163]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[163]: </span>
<span class="go">   year    area price Level Highest</span>
<span class="go">0  1986  58.23㎡  155万    高层       6</span>
<span class="go">1  2020     88㎡  155万    中层      20</span>
<span class="go">2  2010  89.33㎡  365万    低层      28</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [164]: </span><span class="n">s_area</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_numeric</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">area</span><span class="o">.</span><span class="n">str</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>

<span class="gp">In [165]: </span><span class="n">s_price</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_numeric</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">price</span><span class="o">.</span><span class="n">str</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>

<span class="gp">In [166]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;avg_price&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="p">((</span><span class="n">s_price</span><span class="o">/</span><span class="n">s_area</span><span class="p">)</span><span class="o">*</span><span class="mi">10000</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span>
<span class="gp">   .....: </span>                    <span class="s1">&#39;int&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;string&#39;</span><span class="p">)</span> <span class="o">+</span> <span class="s1">&#39;元/平米&#39;</span>
<span class="gp">   .....: </span>

<span class="gp">In [167]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[167]: </span>
<span class="go">   year    area price Level Highest  avg_price</span>
<span class="go">0  1986  58.23㎡  155万    高层       6  26618元/平米</span>
<span class="go">1  2020     88㎡  155万    中层      20  17613元/平米</span>
<span class="go">2  2010  89.33㎡  365万    低层      28  40859元/平米</span>
</pre></div>
</div>
</section>
<section id="id19">
<h3>Ex2：《权力的游戏》剧本数据集<a class="headerlink" href="#id19" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [168]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/script.csv&#39;</span><span class="p">)</span>

<span class="gp">In [169]: </span><span class="n">df</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>

<span class="gp">In [170]: </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">([</span><span class="s1">&#39;Season&#39;</span><span class="p">,</span> <span class="s1">&#39;Episode&#39;</span><span class="p">])[</span><span class="s1">&#39;Sentence&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[170]: </span>
<span class="go">Season    Episode   </span>
<span class="go">Season 1  Episode 1     327</span>
<span class="go">          Episode 10    266</span>
<span class="go">          Episode 2     283</span>
<span class="go">          Episode 3     353</span>
<span class="go">          Episode 4     404</span>
<span class="go">Name: Sentence, dtype: int64</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [171]: </span><span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s1">&#39;Name&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">Sentence</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">split</span><span class="p">()</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">len</span><span class="p">(</span>
<span class="gp">   .....: </span> <span class="p">)</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Name&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">   .....: </span>
<span class="gh">Out[171]: </span>
<span class="go">Name</span>
<span class="go">male singer          109.000000</span>
<span class="go">slave owner           77.000000</span>
<span class="go">manderly              62.000000</span>
<span class="go">lollys stokeworth     62.000000</span>
<span class="go">dothraki matron       56.666667</span>
<span class="go">Name: Sentence, dtype: float64</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [172]: </span><span class="n">s</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">Sentence</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">Name</span><span class="o">.</span><span class="n">shift</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">))</span>

<span class="gp">In [173]: </span><span class="n">s</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="sa">r</span><span class="s1">&#39;\?&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Name&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[173]: </span>
<span class="go">Name</span>
<span class="go">tyrion lannister    527</span>
<span class="go">jon snow            374</span>
<span class="go">jaime lannister     283</span>
<span class="go">arya stark          265</span>
<span class="go">cersei lannister    246</span>
<span class="go">dtype: int64</span>
</pre></div>
</div>
</section>
</section>
<section id="id20">
<h2>第九章 分类数据<a class="headerlink" href="#id20" title="Permalink to this heading">#</a></h2>
<section id="id21">
<h3>Ex1：统计未出现的类别<a class="headerlink" href="#id21" title="Permalink to this heading">#</a></h3>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [174]: </span><span class="k">def</span> <span class="nf">my_crosstab</span><span class="p">(</span><span class="n">s1</span><span class="p">,</span> <span class="n">s2</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="gp">   .....: </span>    <span class="n">idx1</span> <span class="o">=</span> <span class="p">(</span><span class="n">s1</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">categories</span> <span class="k">if</span> <span class="n">s1</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s1">&#39;category&#39;</span> <span class="ow">and</span>
<span class="gp">   .....: </span>                             <span class="ow">not</span> <span class="n">dropna</span> <span class="k">else</span> <span class="n">s1</span><span class="o">.</span><span class="n">unique</span><span class="p">())</span>
<span class="gp">   .....: </span>    <span class="n">idx2</span> <span class="o">=</span> <span class="p">(</span><span class="n">s2</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">categories</span> <span class="k">if</span> <span class="n">s2</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s1">&#39;category&#39;</span> <span class="ow">and</span>
<span class="gp">   .....: </span>                             <span class="ow">not</span> <span class="n">dropna</span> <span class="k">else</span> <span class="n">s2</span><span class="o">.</span><span class="n">unique</span><span class="p">())</span>
<span class="gp">   .....: </span>    <span class="n">res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">idx1</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">idx2</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">])),</span>
<span class="gp">   .....: </span>                    <span class="n">index</span><span class="o">=</span><span class="n">idx1</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">idx2</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">s1</span><span class="p">,</span> <span class="n">s2</span><span class="p">):</span>
<span class="gp">   .....: </span>        <span class="n">res</span><span class="o">.</span><span class="n">at</span><span class="p">[</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">]</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="gp">   .....: </span>    <span class="n">res</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">s1</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">s2</span><span class="o">.</span><span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;int&#39;</span><span class="p">)</span>
<span class="gp">   .....: </span>    <span class="k">return</span> <span class="n">res</span>
<span class="gp">   .....: </span>

<span class="gp">In [175]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;A&#39;</span><span class="p">:[</span><span class="s1">&#39;a&#39;</span><span class="p">,</span><span class="s1">&#39;b&#39;</span><span class="p">,</span><span class="s1">&#39;c&#39;</span><span class="p">,</span><span class="s1">&#39;a&#39;</span><span class="p">],</span>
<span class="gp">   .....: </span>                   <span class="s1">&#39;B&#39;</span><span class="p">:[</span><span class="s1">&#39;cat&#39;</span><span class="p">,</span><span class="s1">&#39;cat&#39;</span><span class="p">,</span><span class="s1">&#39;dog&#39;</span><span class="p">,</span><span class="s1">&#39;cat&#39;</span><span class="p">]})</span>
<span class="gp">   .....: </span>

<span class="gp">In [176]: </span><span class="n">df</span><span class="o">.</span><span class="n">B</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">B</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;category&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">add_categories</span><span class="p">(</span><span class="s1">&#39;sheep&#39;</span><span class="p">)</span>

<span class="gp">In [177]: </span><span class="n">my_crosstab</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">A</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">B</span><span class="p">)</span>
<span class="gh">Out[177]: </span>
<span class="go">B  cat  dog</span>
<span class="go">A          </span>
<span class="go">a    2    0</span>
<span class="go">b    1    0</span>
<span class="go">c    0    1</span>

<span class="gp">In [178]: </span><span class="n">my_crosstab</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">A</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">B</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="gh">Out[178]: </span>
<span class="go">B  cat  dog  sheep</span>
<span class="go">A                 </span>
<span class="go">a    2    0      0</span>
<span class="go">b    1    0      0</span>
<span class="go">c    0    1      0</span>
</pre></div>
</div>
</section>
<section id="id22">
<h3>Ex2：钻石数据集<a class="headerlink" href="#id22" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [179]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/diamonds.csv&#39;</span><span class="p">)</span>

<span class="gp">In [180]: </span><span class="n">s_obj</span><span class="p">,</span> <span class="n">s_cat</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">cut</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">cut</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;category&#39;</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [181]: </span><span class="o">%</span><span class="k">timeit</span> -n 30 s_obj.nunique()
<span class="go">4.13 ms +- 356 us per loop (mean +- std. dev. of 7 runs, 30 loops each)</span>
</pre></div>
</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [182]: </span><span class="o">%</span><span class="k">timeit</span> -n 30 s_cat.nunique()
<span class="go">944 us +- 153 us per loop (mean +- std. dev. of 7 runs, 30 loops each)</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [183]: </span><span class="n">df</span><span class="o">.</span><span class="n">cut</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">cut</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;category&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">reorder_categories</span><span class="p">([</span>
<span class="gp">   .....: </span>        <span class="s1">&#39;Fair&#39;</span><span class="p">,</span> <span class="s1">&#39;Good&#39;</span><span class="p">,</span> <span class="s1">&#39;Very Good&#39;</span><span class="p">,</span> <span class="s1">&#39;Premium&#39;</span><span class="p">,</span> <span class="s1">&#39;Ideal&#39;</span><span class="p">],</span><span class="n">ordered</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [184]: </span><span class="n">df</span><span class="o">.</span><span class="n">clarity</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">clarity</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;category&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">reorder_categories</span><span class="p">([</span>
<span class="gp">   .....: </span>        <span class="s1">&#39;I1&#39;</span><span class="p">,</span> <span class="s1">&#39;SI2&#39;</span><span class="p">,</span> <span class="s1">&#39;SI1&#39;</span><span class="p">,</span> <span class="s1">&#39;VS2&#39;</span><span class="p">,</span> <span class="s1">&#39;VS1&#39;</span><span class="p">,</span> <span class="s1">&#39;VVS2&#39;</span><span class="p">,</span> <span class="s1">&#39;VVS1&#39;</span><span class="p">,</span> <span class="s1">&#39;IF&#39;</span><span class="p">],</span><span class="n">ordered</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [185]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">sort_values</span><span class="p">([</span><span class="s1">&#39;cut&#39;</span><span class="p">,</span> <span class="s1">&#39;clarity&#39;</span><span class="p">],</span> <span class="n">ascending</span><span class="o">=</span><span class="p">[</span><span class="kc">False</span><span class="p">,</span> <span class="kc">True</span><span class="p">])</span>

<span class="gp">In [186]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[186]: </span>
<span class="go">     carat    cut clarity  price</span>
<span class="go">315   0.96  Ideal      I1   2801</span>
<span class="go">535   0.96  Ideal      I1   2826</span>
<span class="go">551   0.97  Ideal      I1   2830</span>

<span class="gp">In [187]: </span><span class="n">res</span><span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[187]: </span>
<span class="go">       carat   cut clarity  price</span>
<span class="go">47407   0.52  Fair      IF   1849</span>
<span class="go">49683   0.52  Fair      IF   2144</span>
<span class="go">50126   0.47  Fair      IF   2211</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [188]: </span><span class="n">df</span><span class="o">.</span><span class="n">cut</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">cut</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">reorder_categories</span><span class="p">(</span>
<span class="gp">   .....: </span>        <span class="n">df</span><span class="o">.</span><span class="n">cut</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">categories</span><span class="p">[::</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>
<span class="gp">   .....: </span>

<span class="gp">In [189]: </span><span class="n">df</span><span class="o">.</span><span class="n">clarity</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">clarity</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">reorder_categories</span><span class="p">(</span>
<span class="gp">   .....: </span>            <span class="n">df</span><span class="o">.</span><span class="n">clarity</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">categories</span><span class="p">[::</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>
<span class="gp">   .....: </span>

<span class="gp">In [190]: </span><span class="n">df</span><span class="o">.</span><span class="n">cut</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">cut</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">codes</span> <span class="c1"># 方法一：利用cat.codes</span>

<span class="gp">In [191]: </span><span class="n">clarity_cat</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">clarity</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">categories</span>

<span class="gp">In [192]: </span><span class="n">df</span><span class="o">.</span><span class="n">clarity</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">clarity</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span>
<span class="gp">   .....: </span>            <span class="n">clarity_cat</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span>
<span class="gp">   .....: </span>                <span class="nb">len</span><span class="p">(</span><span class="n">clarity_cat</span><span class="p">)))))</span> <span class="c1"># 方法二：使用replace映射</span>
<span class="gp">   .....: </span>

<span class="gp">In [193]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[193]: </span>
<span class="go">   carat  cut  clarity  price</span>
<span class="go">0   0.23    0        6    326</span>
<span class="go">1   0.21    1        5    326</span>
<span class="go">2   0.23    3        3    327</span>
</pre></div>
</div>
<ol class="arabic simple" start="4">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [194]: </span><span class="n">q</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mf">0.2</span><span class="p">,</span> <span class="mf">0.4</span><span class="p">,</span> <span class="mf">0.6</span><span class="p">,</span> <span class="mf">0.8</span><span class="p">,</span> <span class="mi">1</span><span class="p">]</span>

<span class="gp">In [195]: </span><span class="n">point</span> <span class="o">=</span> <span class="p">[</span><span class="o">-</span><span class="n">np</span><span class="o">.</span><span class="n">infty</span><span class="p">,</span> <span class="mi">1000</span><span class="p">,</span> <span class="mi">3500</span><span class="p">,</span> <span class="mi">5500</span><span class="p">,</span> <span class="mi">18000</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">infty</span><span class="p">]</span>

<span class="gp">In [196]: </span><span class="n">avg</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">price</span> <span class="o">/</span> <span class="n">df</span><span class="o">.</span><span class="n">carat</span>

<span class="gp">In [197]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;avg_cut&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">cut</span><span class="p">(</span><span class="n">avg</span><span class="p">,</span> <span class="n">bins</span><span class="o">=</span><span class="n">point</span><span class="p">,</span> <span class="n">labels</span><span class="o">=</span><span class="p">[</span>
<span class="gp">   .....: </span>                <span class="s1">&#39;Very Low&#39;</span><span class="p">,</span> <span class="s1">&#39;Low&#39;</span><span class="p">,</span> <span class="s1">&#39;Mid&#39;</span><span class="p">,</span> <span class="s1">&#39;High&#39;</span><span class="p">,</span> <span class="s1">&#39;Very High&#39;</span><span class="p">])</span>
<span class="gp">   .....: </span>

<span class="gp">In [198]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;avg_qcut&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">qcut</span><span class="p">(</span><span class="n">avg</span><span class="p">,</span> <span class="n">q</span><span class="o">=</span><span class="n">q</span><span class="p">,</span> <span class="n">labels</span><span class="o">=</span><span class="p">[</span>
<span class="gp">   .....: </span>                <span class="s1">&#39;Very Low&#39;</span><span class="p">,</span> <span class="s1">&#39;Low&#39;</span><span class="p">,</span> <span class="s1">&#39;Mid&#39;</span><span class="p">,</span> <span class="s1">&#39;High&#39;</span><span class="p">,</span> <span class="s1">&#39;Very High&#39;</span><span class="p">])</span>
<span class="gp">   .....: </span>

<span class="gp">In [199]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[199]: </span>
<span class="go">   carat  cut  clarity  price avg_cut  avg_qcut</span>
<span class="go">0   0.23    0        6    326     Low  Very Low</span>
<span class="go">1   0.21    1        5    326     Low  Very Low</span>
<span class="go">2   0.23    3        3    327     Low  Very Low</span>
<span class="go">3   0.29    1        4    334     Low  Very Low</span>
<span class="go">4   0.31    3        6    335     Low  Very Low</span>
</pre></div>
</div>
<ol class="arabic simple" start="5">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [200]: </span><span class="n">df</span><span class="o">.</span><span class="n">avg_cut</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
<span class="gh">Out[200]: </span>
<span class="go">[&#39;Low&#39;, &#39;Mid&#39;, &#39;High&#39;]</span>
<span class="go">Categories (3, object): [&#39;Low&#39; &lt; &#39;Mid&#39; &lt; &#39;High&#39;]</span>

<span class="gp">In [201]: </span><span class="n">df</span><span class="o">.</span><span class="n">avg_cut</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">categories</span>
<span class="gh">Out[201]: </span><span class="go">Index([&#39;Very Low&#39;, &#39;Low&#39;, &#39;Mid&#39;, &#39;High&#39;, &#39;Very High&#39;], dtype=&#39;object&#39;)</span>

<span class="gp">In [202]: </span><span class="n">df</span><span class="o">.</span><span class="n">avg_cut</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">avg_cut</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">remove_categories</span><span class="p">([</span>
<span class="gp">   .....: </span>            <span class="s1">&#39;Very Low&#39;</span><span class="p">,</span> <span class="s1">&#39;Very High&#39;</span><span class="p">])</span>
<span class="gp">   .....: </span>

<span class="gp">In [203]: </span><span class="n">df</span><span class="o">.</span><span class="n">avg_cut</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[203]: </span>
<span class="go">0    Low</span>
<span class="go">1    Low</span>
<span class="go">2    Low</span>
<span class="go">Name: avg_cut, dtype: category</span>
<span class="go">Categories (3, object): [&#39;Low&#39; &lt; &#39;Mid&#39; &lt; &#39;High&#39;]</span>
</pre></div>
</div>
<ol class="arabic simple" start="6">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [204]: </span><span class="n">interval_avg</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">IntervalIndex</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">qcut</span><span class="p">(</span><span class="n">avg</span><span class="p">,</span> <span class="n">q</span><span class="o">=</span><span class="n">q</span><span class="p">))</span>

<span class="gp">In [205]: </span><span class="n">interval_avg</span><span class="o">.</span><span class="n">right</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[205]: </span>
<span class="go">0    2295.0</span>
<span class="go">1    2295.0</span>
<span class="go">2    2295.0</span>
<span class="go">dtype: float64</span>

<span class="gp">In [206]: </span><span class="n">interval_avg</span><span class="o">.</span><span class="n">left</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[206]: </span>
<span class="go">0    1051.162</span>
<span class="go">1    1051.162</span>
<span class="go">2    1051.162</span>
<span class="go">dtype: float64</span>

<span class="gp">In [207]: </span><span class="n">interval_avg</span><span class="o">.</span><span class="n">length</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[207]: </span>
<span class="go">0    1243.838</span>
<span class="go">1    1243.838</span>
<span class="go">2    1243.838</span>
<span class="go">dtype: float64</span>
</pre></div>
</div>
</section>
</section>
<section id="id23">
<h2>第十章 时序数据<a class="headerlink" href="#id23" title="Permalink to this heading">#</a></h2>
<section id="id24">
<h3>Ex1：太阳辐射数据集<a class="headerlink" href="#id24" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [208]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/solar.csv&#39;</span><span class="p">,</span> <span class="n">usecols</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Data&#39;</span><span class="p">,</span><span class="s1">&#39;Time&#39;</span><span class="p">,</span>
<span class="gp">   .....: </span>                 <span class="s1">&#39;Radiation&#39;</span><span class="p">,</span><span class="s1">&#39;Temperature&#39;</span><span class="p">])</span>
<span class="gp">   .....: </span>

<span class="gp">In [209]: </span><span class="n">solar_date</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Data</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">extract</span><span class="p">(</span><span class="s1">&#39;([/|\w]+\s).+&#39;</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>

<span class="gp">In [210]: </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;Data&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_datetime</span><span class="p">(</span><span class="n">solar_date</span> <span class="o">+</span> <span class="n">df</span><span class="o">.</span><span class="n">Time</span><span class="p">)</span>

<span class="gp">In [211]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="s1">&#39;Time&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;Data&#39;</span><span class="p">:</span><span class="s1">&#39;Datetime&#39;</span><span class="p">}</span>
<span class="gp">   .....: </span>            <span class="p">)</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s1">&#39;Datetime&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">sort_index</span><span class="p">()</span>
<span class="gp">   .....: </span>

<span class="gp">In [212]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[212]: </span>
<span class="go">                     Radiation  Temperature</span>
<span class="go">Datetime                                   </span>
<span class="go">2016-09-01 00:00:08       2.58           51</span>
<span class="go">2016-09-01 00:05:10       2.83           51</span>
<span class="go">2016-09-01 00:20:06       2.16           51</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<ol class="loweralpha simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [213]: </span><span class="n">s</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">diff</span><span class="p">()</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">total_seconds</span><span class="p">()</span>

<span class="gp">In [214]: </span><span class="n">max_3</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">nlargest</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span><span class="o">.</span><span class="n">index</span>

<span class="gp">In [215]: </span><span class="n">df</span><span class="o">.</span><span class="n">index</span><span class="p">[</span><span class="n">max_3</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">max_3</span><span class="o">-</span><span class="mi">1</span><span class="p">)]</span>
<span class="gh">Out[215]: </span>
<span class="go">DatetimeIndex([&#39;2016-09-29 23:55:26&#39;, &#39;2016-10-01 00:00:19&#39;,</span>
<span class="go">               &#39;2016-11-29 19:05:02&#39;, &#39;2016-12-01 00:00:02&#39;,</span>
<span class="go">               &#39;2016-12-05 20:45:53&#39;, &#39;2016-12-08 11:10:42&#39;],</span>
<span class="go">              dtype=&#39;datetime64[ns]&#39;, name=&#39;Datetime&#39;, freq=None)</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [216]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">mask</span><span class="p">((</span><span class="n">s</span><span class="o">&gt;</span><span class="n">s</span><span class="o">.</span><span class="n">quantile</span><span class="p">(</span><span class="mf">0.99</span><span class="p">))</span><span class="o">|</span><span class="p">(</span><span class="n">s</span><span class="o">&lt;</span><span class="n">s</span><span class="o">.</span><span class="n">quantile</span><span class="p">(</span><span class="mf">0.01</span><span class="p">)))</span>

<span class="gp">In [217]: </span><span class="n">_</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">res</span><span class="p">,</span> <span class="n">bins</span><span class="o">=</span><span class="mi">50</span><span class="p">)</span>
</pre></div>
</div>
<a class="reference internal image-reference" href="../_images/ch10_ex1.png"><img alt="相邻记录时间间隔（秒）的直方图：剔除1%与99%分位数之外的值，共50个区间" src="../_images/ch10_ex1.png" style="width: 400px;" /></a>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<ol class="loweralpha simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [218]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Radiation</span><span class="o">.</span><span class="n">rolling</span><span class="p">(</span><span class="s1">&#39;6H&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">Temperature</span><span class="p">)</span>

<span class="gp">In [219]: </span><span class="n">res</span><span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[219]: </span>
<span class="go">Datetime</span>
<span class="go">2016-12-31 23:45:04    0.328574</span>
<span class="go">2016-12-31 23:50:03    0.261883</span>
<span class="go">2016-12-31 23:55:01    0.262406</span>
<span class="go">dtype: float64</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [220]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Temperature</span><span class="o">.</span><span class="n">resample</span><span class="p">(</span><span class="s1">&#39;6H&#39;</span><span class="p">,</span> <span class="n">origin</span><span class="o">=</span><span class="s1">&#39;03:00:00&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>

<span class="gp">In [221]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[221]: </span>
<span class="go">Datetime</span>
<span class="go">2016-08-31 21:00:00    51.218750</span>
<span class="go">2016-09-01 03:00:00    50.033333</span>
<span class="go">2016-09-01 09:00:00    59.379310</span>
<span class="go">Freq: 6H, Name: Temperature, dtype: float64</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="3">
<li></li>
</ol>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># 非常慢</span>
<span class="n">my_dt</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">shift</span><span class="p">(</span><span class="n">freq</span><span class="o">=</span><span class="s1">&#39;-6H&#39;</span><span class="p">)</span>
<span class="n">int_loc</span> <span class="o">=</span> <span class="p">[</span><span class="n">df</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">get_indexer</span><span class="p">([</span><span class="n">i</span><span class="p">],</span> <span class="n">method</span><span class="o">=</span><span class="s1">&#39;nearest&#39;</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">my_dt</span><span class="p">]</span>
<span class="n">int_loc</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">int_loc</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Radiation</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="n">int_loc</span><span class="p">]</span>
<span class="n">res</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">index</span>
<span class="n">res</span><span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="c1"># 纸质版上介绍了merge_asof，性能差距可以达到3-4个数量级</span>
<span class="gp">In [222]: </span><span class="n">target</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span>
<span class="gp">   .....: </span>    <span class="p">{</span>
<span class="gp">   .....: </span>        <span class="s2">&quot;Time&quot;</span><span class="p">:</span> <span class="n">df</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">shift</span><span class="p">(</span><span class="n">freq</span><span class="o">=</span><span class="s1">&#39;-6H&#39;</span><span class="p">),</span>
<span class="gp">   .....: </span>        <span class="s2">&quot;Datetime&quot;</span><span class="p">:</span> <span class="n">df</span><span class="o">.</span><span class="n">index</span><span class="p">,</span>
<span class="gp">   .....: </span>    <span class="p">}</span>
<span class="gp">   .....: </span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [223]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">merge_asof</span><span class="p">(</span>
<span class="gp">   .....: </span>    <span class="n">target</span><span class="p">,</span>
<span class="gp">   .....: </span>    <span class="n">df</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;Datetime&quot;</span><span class="p">:</span> <span class="s2">&quot;Time&quot;</span><span class="p">}),</span>
<span class="gp">   .....: </span>    <span class="n">left_on</span><span class="o">=</span><span class="s2">&quot;Time&quot;</span><span class="p">,</span>
<span class="gp">   .....: </span>    <span class="n">right_on</span><span class="o">=</span><span class="s2">&quot;Time&quot;</span><span class="p">,</span>
<span class="gp">   .....: </span>    <span class="n">direction</span><span class="o">=</span><span class="s2">&quot;nearest&quot;</span>
<span class="gp">   .....: </span><span class="p">)</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Datetime&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">Radiation</span>
<span class="gp">   .....: </span>

<span class="gp">In [224]: </span><span class="n">res</span><span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[224]: </span>
<span class="go">Datetime</span>
<span class="go">2016-12-31 23:45:04    9.33</span>
<span class="go">2016-12-31 23:50:03    8.49</span>
<span class="go">2016-12-31 23:55:01    5.84</span>
<span class="go">Name: Radiation, dtype: float64</span>
</pre></div>
</div>
</section>
<section id="id25">
<h3>Ex2：水果销量数据集<a class="headerlink" href="#id25" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li></li>
</ol>
<ol class="loweralpha simple">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [225]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/fruit.csv&#39;</span><span class="p">)</span>

<span class="gp">In [226]: </span><span class="n">df</span><span class="o">.</span><span class="n">Date</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_datetime</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">Date</span><span class="p">)</span>

<span class="gp">In [227]: </span><span class="n">df_grape</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&quot;Fruit == &#39;Grape&#39;&quot;</span><span class="p">)</span>

<span class="gp">In [228]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df_grape</span><span class="o">.</span><span class="n">groupby</span><span class="p">([</span><span class="n">np</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">df_grape</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">day</span><span class="o">&lt;=</span><span class="mi">15</span><span class="p">,</span>
<span class="gp">   .....: </span>                        <span class="s1">&#39;First&#39;</span><span class="p">,</span> <span class="s1">&#39;Second&#39;</span><span class="p">),</span><span class="n">df_grape</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">month</span><span class="p">]</span>
<span class="gp">   .....: </span>                        <span class="p">)[</span><span class="s1">&#39;Sale&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">unstack</span><span class="p">(</span><span class="mi">0</span>
<span class="gp">   .....: </span>                        <span class="p">)</span><span class="o">.</span><span class="n">droplevel</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [229]: </span><span class="n">res</span> <span class="o">=</span> <span class="p">(</span><span class="n">res</span><span class="o">.</span><span class="n">First</span><span class="o">/</span><span class="n">res</span><span class="o">.</span><span class="n">Second</span><span class="p">)</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">(</span><span class="s1">&#39;Month&#39;</span><span class="p">)</span>

<span class="gp">In [230]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[230]: </span>
<span class="go">Month</span>
<span class="go">1    1.174998</span>
<span class="go">2    0.968890</span>
<span class="go">3    0.951351</span>
<span class="go">4    1.020797</span>
<span class="go">5    0.931061</span>
<span class="go">dtype: float64</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [231]: </span><span class="n">df</span><span class="p">[</span><span class="n">df</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">is_month_end</span><span class="p">]</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&quot;Fruit == &#39;Pear&#39;&quot;</span>
<span class="gp">   .....: </span>                          <span class="p">)</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Date&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">Sale</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">   .....: </span>
<span class="gh">Out[231]: </span>
<span class="go">Date</span>
<span class="go">2019-01-31    847</span>
<span class="go">2019-02-28    774</span>
<span class="go">2019-03-31    761</span>
<span class="go">2019-04-30    648</span>
<span class="go">2019-05-31    616</span>
<span class="go">Name: Sale, dtype: int64</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [232]: </span><span class="n">df</span><span class="p">[</span><span class="n">df</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">date_range</span><span class="p">(</span><span class="s1">&#39;20190101&#39;</span><span class="p">,</span> <span class="s1">&#39;20191231&#39;</span><span class="p">,</span>
<span class="gp">   .....: </span>                <span class="n">freq</span><span class="o">=</span><span class="s1">&#39;BM&#39;</span><span class="p">))]</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&quot;Fruit == &#39;Pear&#39;&quot;</span>
<span class="gp">   .....: </span>                <span class="p">)</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Date&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">Sale</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">   .....: </span>
<span class="gh">Out[232]: </span>
<span class="go">Date</span>
<span class="go">2019-01-31    847</span>
<span class="go">2019-02-28    774</span>
<span class="go">2019-03-29    510</span>
<span class="go">2019-04-30    648</span>
<span class="go">2019-05-31    616</span>
<span class="go">Name: Sale, dtype: int64</span>
</pre></div>
</div>
<ol class="loweralpha simple" start="4">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [233]: </span><span class="n">target_dt</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">drop_duplicates</span><span class="p">()</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">drop_duplicates</span><span class="p">(</span>
<span class="gp">   .....: </span>            <span class="p">)</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">month</span><span class="p">)[</span><span class="s1">&#39;Date&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">nlargest</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [234]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s1">&#39;Date&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">target_dt</span><span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span>
<span class="gp">   .....: </span>            <span class="p">)</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&quot;Fruit == &#39;Apple&#39;&quot;</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [235]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">res</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">month</span><span class="p">)[</span><span class="s1">&#39;Sale&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span>
<span class="gp">   .....: </span>            <span class="p">)</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">(</span><span class="s1">&#39;Month&#39;</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [236]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[236]: </span>
<span class="go">Month</span>
<span class="go">1    65.313725</span>
<span class="go">2    54.061538</span>
<span class="go">3    59.325581</span>
<span class="go">4    65.795455</span>
<span class="go">5    57.465116</span>
<span class="go">Name: Sale, dtype: float64</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [237]: </span><span class="n">month_order</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;January&#39;</span><span class="p">,</span><span class="s1">&#39;February&#39;</span><span class="p">,</span><span class="s1">&#39;March&#39;</span><span class="p">,</span><span class="s1">&#39;April&#39;</span><span class="p">,</span>
<span class="gp">   .....: </span>                <span class="s1">&#39;May&#39;</span><span class="p">,</span><span class="s1">&#39;June&#39;</span><span class="p">,</span><span class="s1">&#39;July&#39;</span><span class="p">,</span><span class="s1">&#39;August&#39;</span><span class="p">,</span><span class="s1">&#39;September&#39;</span><span class="p">,</span>
<span class="gp">   .....: </span>                <span class="s1">&#39;October&#39;</span><span class="p">,</span><span class="s1">&#39;November&#39;</span><span class="p">,</span><span class="s1">&#39;December&#39;</span><span class="p">]</span>
<span class="gp">   .....: </span>

<span class="gp">In [238]: </span><span class="n">week_order</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Mon&#39;</span><span class="p">,</span><span class="s1">&#39;Tue&#39;</span><span class="p">,</span><span class="s1">&#39;Wed&#39;</span><span class="p">,</span><span class="s1">&#39;Thu&#39;</span><span class="p">,</span><span class="s1">&#39;Fri&#39;</span><span class="p">,</span><span class="s1">&#39;Sat&#39;</span><span class="p">,</span><span class="s1">&#39;Sun&#39;</span><span class="p">]</span>

<span class="gp">In [239]: </span><span class="n">group1</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">month_name</span><span class="p">()</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;category&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">reorder_categories</span><span class="p">(</span>
<span class="gp">   .....: </span>        <span class="n">month_order</span><span class="p">,</span> <span class="n">ordered</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [240]: </span><span class="n">group2</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Fruit</span>

<span class="gp">In [241]: </span><span class="n">group3</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">dayofweek</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">7</span><span class="p">),</span><span class="n">week_order</span><span class="p">))</span>
<span class="gp">   .....: </span>         <span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s1">&#39;category&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">cat</span><span class="o">.</span><span class="n">reorder_categories</span><span class="p">(</span>
<span class="gp">   .....: </span>         <span class="n">week_order</span><span class="p">,</span> <span class="n">ordered</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [242]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">([</span><span class="n">group1</span><span class="p">,</span> <span class="n">group2</span><span class="p">,</span><span class="n">group3</span><span class="p">])[</span><span class="s1">&#39;Sale&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(</span>
<span class="gp">   .....: </span>         <span class="p">)</span><span class="o">.</span><span class="n">unstack</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">droplevel</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [243]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[243]: </span>
<span class="go">Date        January  February  March  April  May  June  July  August  September  October  November  December</span>
<span class="go">Fruit Date                                                                                                  </span>
<span class="go">Apple Mon        46        43     43     47   43    40    41      38         59       42        39        45</span>
<span class="go">      Tue        50        40     44     52   46    39    50      42         40       57        47        47</span>
<span class="go">      Wed        50        47     37     43   39    39    58      43         35       46        47        38</span>
<span class="go">      Thu        45        35     31     47   58    33    52      44         36       63        37        40</span>
<span class="go">      Fri        32        33     52     31   46    38    37      48         34       37        46        41</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li></li>
</ol>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [244]: </span><span class="n">df_apple</span> <span class="o">=</span> <span class="n">df</span><span class="p">[(</span><span class="n">df</span><span class="o">.</span><span class="n">Fruit</span><span class="o">==</span><span class="s1">&#39;Apple&#39;</span><span class="p">)</span><span class="o">&amp;</span><span class="p">(</span>
<span class="gp">   .....: </span>              <span class="o">~</span><span class="n">df</span><span class="o">.</span><span class="n">Date</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">dayofweek</span><span class="o">.</span><span class="n">isin</span><span class="p">([</span><span class="mi">5</span><span class="p">,</span><span class="mi">6</span><span class="p">]))]</span>
<span class="gp">   .....: </span>

<span class="gp">In [245]: </span><span class="n">s</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">df_apple</span><span class="o">.</span><span class="n">Sale</span><span class="o">.</span><span class="n">values</span><span class="p">,</span>
<span class="gp">   .....: </span>              <span class="n">index</span><span class="o">=</span><span class="n">df_apple</span><span class="o">.</span><span class="n">Date</span><span class="p">)</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Date&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
<span class="gp">   .....: </span>

<span class="gp">In [246]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">rolling</span><span class="p">(</span><span class="s1">&#39;10D&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span><span class="o">.</span><span class="n">reindex</span><span class="p">(</span>
<span class="gp">   .....: </span>              <span class="n">pd</span><span class="o">.</span><span class="n">date_range</span><span class="p">(</span><span class="s1">&#39;20190101&#39;</span><span class="p">,</span><span class="s1">&#39;20191231&#39;</span><span class="p">))</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s1">&#39;ffill&#39;</span><span class="p">)</span>
<span class="gp">   .....: </span>

<span class="gp">In [247]: </span><span class="n">res</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[247]: </span>
<span class="go">2019-01-01    189.000000</span>
<span class="go">2019-01-02    335.500000</span>
<span class="go">2019-01-03    520.333333</span>
<span class="go">2019-01-04    527.750000</span>
<span class="go">2019-01-05    527.750000</span>
<span class="go">Freq: D, dtype: float64</span>
</pre></div>
</div>
</section>
</section>
</section>


              </article>
              

              
          </div>
          
      </div>
    </div>

  
  
  <!-- Scripts loaded after <body> so the DOM is not blocked -->
  <script src="../_static/scripts/pydata-sphinx-theme.js?digest=92025949c220c2e29695"></script>

<footer class="bd-footer"><div class="bd-footer__inner container">
  
  <div class="footer-item">
    <p class="copyright">
    &copy; Copyright 2020-2022, Datawhale, 耿远昊.<br>
</p>
  </div>
  
  <div class="footer-item">
    <p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 5.0.2.<br>
</p>
  </div>
  
</div>
</footer>
  </body>
</html>